Upgrade XNNPACK to b0da47a9fab5216f120c5abc3aa44a61b2dab932

Notable changes picked up by this upgrade:
* macOS arm64 Bazel configs and a matching macos_arm64 config_setting.
* New QS8 NEON GEMM/IGEMM microkernels (mlal-lane, mull-addw-dup, and
  c2/c8/c16 padal variants) plus new AArch64 assembly kernels.
* New F32 ibilinear-chw NEON/NEONFMA kernels; removal of the 8x8c4/12x4c4
  scalar QS8/QU8 GEMM kernels and the 12x8c4 NEONDOT kernels.
* Sparse FP32 MobileNet v1/v2/v3 and QU8 MobileNet v1 benchmark models,
  and a new qs8-gemm-e2e end-to-end benchmark.
* pthreadpool dependency bump, updated Emscripten toolchain config_setting
  paths, and moderate timeouts on several slow unit tests.

Test: make
Change-Id: I2562d30aac45d2ac773e4cd647773b537a137807
diff --git a/.bazelrc b/.bazelrc
index ea28201..ec740f3 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -44,3 +44,9 @@
 build:ios_fat --config=ios
 build:ios_fat --ios_multi_cpus=armv7,arm64
 build:ios_fat --watchos_cpus=armv7k
+
+# macOS configs.
+build:macos --apple_platform_type=macos
+
+build:macos_arm64 --config=macos
+build:macos_arm64 --cpu=darwin_arm64
\ No newline at end of file
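For reference, the new macOS configs above are selected with Bazel's --config flag. A minimal sketch of building the library for an Apple Silicon host (the //:XNNPACK label comes from BUILD.bazel below; any additional host-specific flags are assumed):

    bazel build --config=macos_arm64 //:XNNPACK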
diff --git a/BUILD.bazel b/BUILD.bazel
index 2fb6a63..97d150c 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -491,8 +491,6 @@
     "src/math/sigmoid-scalar-rr2-lut64-p2-div.c",
     "src/math/sigmoid-scalar-rr2-lut2048-p1-div.c",
     "src/math/sigmoid-scalar-rr2-p5-div.c",
-    "src/qs8-gemm/gen/8x8c4-minmax-scalar.c",
-    "src/qs8-gemm/gen/12x4c4-minmax-scalar.c",
     "src/qs8-requantization/fp32-scalar-lrintf.c",
     "src/qs8-requantization/fp32-scalar-magic.c",
     "src/qs8-requantization/precise-scalar-signed64.c",
@@ -505,8 +503,6 @@
     "src/qu8-gavgpool/7p7x-minmax-scalar-c1.c",
     "src/qu8-gavgpool/7x-minmax-scalar-c1.c",
     "src/qu8-gemm/2x2-minmax-scalar.c",
-    "src/qu8-gemm/gen/8x8c4-minmax-scalar.c",
-    "src/qu8-gemm/gen/12x4c4-minmax-scalar.c",
     "src/qu8-igemm/2x2-minmax-scalar.c",
     "src/qu8-requantization/fp32-scalar-lrintf.c",
     "src/qu8-requantization/fp32-scalar-magic.c",
@@ -818,14 +814,6 @@
     "src/f32-dwconv2d-chw/gen/3x3s2p1-minmax-wasmsimd-arm-loadsplat-2x4.c",
     "src/f32-dwconv2d-chw/gen/3x3s2p1-minmax-wasmsimd-arm-loadsplat-3x4.c",
     "src/f32-dwconv2d-chw/gen/3x3s2p1-minmax-wasmsimd-arm-loadsplat-4x4.c",
-    "src/f32-dwconv2d-chw/gen/3x3s2p1-minmax-wasmsimd-x86-loadsplat-1x4-acc2.c",
-    "src/f32-dwconv2d-chw/gen/3x3s2p1-minmax-wasmsimd-x86-loadsplat-1x4-acc3.c",
-    "src/f32-dwconv2d-chw/gen/3x3s2p1-minmax-wasmsimd-x86-loadsplat-1x4-acc4.c",
-    "src/f32-dwconv2d-chw/gen/3x3s2p1-minmax-wasmsimd-x86-loadsplat-1x4.c",
-    "src/f32-dwconv2d-chw/gen/3x3s2p1-minmax-wasmsimd-x86-loadsplat-2x4-acc2.c",
-    "src/f32-dwconv2d-chw/gen/3x3s2p1-minmax-wasmsimd-x86-loadsplat-2x4.c",
-    "src/f32-dwconv2d-chw/gen/3x3s2p1-minmax-wasmsimd-x86-loadsplat-3x4.c",
-    "src/f32-dwconv2d-chw/gen/3x3s2p1-minmax-wasmsimd-x86-loadsplat-4x4.c",
     "src/f32-dwconv2d-chw/gen/3x3s2p1-minmax-wasmsimd-arm-splat-1x4-acc2.c",
     "src/f32-dwconv2d-chw/gen/3x3s2p1-minmax-wasmsimd-arm-splat-1x4-acc3.c",
     "src/f32-dwconv2d-chw/gen/3x3s2p1-minmax-wasmsimd-arm-splat-1x4-acc4.c",
@@ -834,6 +822,14 @@
     "src/f32-dwconv2d-chw/gen/3x3s2p1-minmax-wasmsimd-arm-splat-2x4.c",
     "src/f32-dwconv2d-chw/gen/3x3s2p1-minmax-wasmsimd-arm-splat-3x4.c",
     "src/f32-dwconv2d-chw/gen/3x3s2p1-minmax-wasmsimd-arm-splat-4x4.c",
+    "src/f32-dwconv2d-chw/gen/3x3s2p1-minmax-wasmsimd-x86-loadsplat-1x4-acc2.c",
+    "src/f32-dwconv2d-chw/gen/3x3s2p1-minmax-wasmsimd-x86-loadsplat-1x4-acc3.c",
+    "src/f32-dwconv2d-chw/gen/3x3s2p1-minmax-wasmsimd-x86-loadsplat-1x4-acc4.c",
+    "src/f32-dwconv2d-chw/gen/3x3s2p1-minmax-wasmsimd-x86-loadsplat-1x4.c",
+    "src/f32-dwconv2d-chw/gen/3x3s2p1-minmax-wasmsimd-x86-loadsplat-2x4-acc2.c",
+    "src/f32-dwconv2d-chw/gen/3x3s2p1-minmax-wasmsimd-x86-loadsplat-2x4.c",
+    "src/f32-dwconv2d-chw/gen/3x3s2p1-minmax-wasmsimd-x86-loadsplat-3x4.c",
+    "src/f32-dwconv2d-chw/gen/3x3s2p1-minmax-wasmsimd-x86-loadsplat-4x4.c",
     "src/f32-dwconv2d-chw/gen/3x3s2p1-minmax-wasmsimd-x86-splat-1x4-acc2.c",
     "src/f32-dwconv2d-chw/gen/3x3s2p1-minmax-wasmsimd-x86-splat-1x4-acc3.c",
     "src/f32-dwconv2d-chw/gen/3x3s2p1-minmax-wasmsimd-x86-splat-1x4-acc4.c",
@@ -1523,6 +1519,8 @@
     "src/f32-hswish/gen/hswish-neon-x4.c",
     "src/f32-hswish/gen/hswish-neon-x8.c",
     "src/f32-hswish/gen/hswish-neon-x16.c",
+    "src/f32-ibilinear-chw/gen/neon-p4.c",
+    "src/f32-ibilinear-chw/gen/neon-p8.c",
     "src/f32-ibilinear/gen/neon-c4.c",
     "src/f32-ibilinear/gen/neon-c8.c",
     "src/f32-igemm/gen/1x8-minmax-neon-dup-ld64.c",
@@ -1697,13 +1695,117 @@
     "src/qs8-gavgpool/gen/7x-minmax-neon-c24-acc2.c",
     "src/qs8-gavgpool/gen/7x-minmax-neon-c32-acc2.c",
     "src/qs8-gemm/gen/1x8-minmax-neon-mlal-lane.c",
+    "src/qs8-gemm/gen/1x8-minmax-neon-mull-addw-dup.c",
+    "src/qs8-gemm/gen/1x8c2-minmax-neon-mlal-padal-dup.c",
+    "src/qs8-gemm/gen/1x8c2-minmax-neon-mull-padal-dup.c",
+    "src/qs8-gemm/gen/1x8c8-minmax-neon-mlal-padal.c",
+    "src/qs8-gemm/gen/1x8c8-minmax-neon-mull-padal.c",
+    "src/qs8-gemm/gen/1x8c16-minmax-neon-mlal-padal.c",
     "src/qs8-gemm/gen/1x16-minmax-neon-mlal-lane.c",
+    "src/qs8-gemm/gen/1x16-minmax-neon-mull-addw-dup.c",
+    "src/qs8-gemm/gen/1x16c2-minmax-neon-mlal-padal-dup.c",
+    "src/qs8-gemm/gen/1x16c2-minmax-neon-mull-padal-dup.c",
+    "src/qs8-gemm/gen/1x16c8-minmax-neon-mlal-padal.c",
+    "src/qs8-gemm/gen/1x16c8-minmax-neon-mull-padal.c",
+    "src/qs8-gemm/gen/1x16c16-minmax-neon-mlal-padal.c",
     "src/qs8-gemm/gen/2x8-minmax-neon-mlal-lane.c",
+    "src/qs8-gemm/gen/2x8-minmax-neon-mull-addw-dup.c",
+    "src/qs8-gemm/gen/2x8c2-minmax-neon-mlal-padal-dup.c",
+    "src/qs8-gemm/gen/2x8c2-minmax-neon-mull-padal-dup.c",
+    "src/qs8-gemm/gen/2x8c8-minmax-neon-mlal-padal.c",
+    "src/qs8-gemm/gen/2x8c8-minmax-neon-mull-padal.c",
+    "src/qs8-gemm/gen/2x8c16-minmax-neon-mlal-padal.c",
     "src/qs8-gemm/gen/2x16-minmax-neon-mlal-lane.c",
+    "src/qs8-gemm/gen/2x16-minmax-neon-mull-addw-dup.c",
+    "src/qs8-gemm/gen/2x16c2-minmax-neon-mlal-padal-dup.c",
+    "src/qs8-gemm/gen/2x16c2-minmax-neon-mull-padal-dup.c",
+    "src/qs8-gemm/gen/2x16c8-minmax-neon-mlal-padal.c",
+    "src/qs8-gemm/gen/2x16c8-minmax-neon-mull-padal.c",
+    "src/qs8-gemm/gen/2x16c16-minmax-neon-mlal-padal.c",
+    "src/qs8-gemm/gen/3x8-minmax-neon-mlal-lane.c",
+    "src/qs8-gemm/gen/3x8-minmax-neon-mull-addw-dup.c",
+    "src/qs8-gemm/gen/3x8c2-minmax-neon-mlal-padal-dup.c",
+    "src/qs8-gemm/gen/3x8c2-minmax-neon-mull-padal-dup.c",
+    "src/qs8-gemm/gen/3x8c8-minmax-neon-mlal-padal.c",
+    "src/qs8-gemm/gen/3x8c8-minmax-neon-mull-padal.c",
+    "src/qs8-gemm/gen/3x8c16-minmax-neon-mlal-padal.c",
+    "src/qs8-gemm/gen/3x16-minmax-neon-mlal-lane.c",
+    "src/qs8-gemm/gen/3x16-minmax-neon-mull-addw-dup.c",
+    "src/qs8-gemm/gen/3x16c2-minmax-neon-mlal-padal-dup.c",
+    "src/qs8-gemm/gen/3x16c2-minmax-neon-mull-padal-dup.c",
+    "src/qs8-gemm/gen/3x16c8-minmax-neon-mlal-padal.c",
+    "src/qs8-gemm/gen/3x16c8-minmax-neon-mull-padal.c",
+    "src/qs8-gemm/gen/3x16c16-minmax-neon-mlal-padal.c",
+    "src/qs8-gemm/gen/4x8-minmax-neon-mlal-lane.c",
+    "src/qs8-gemm/gen/4x8-minmax-neon-mull-addw-dup.c",
+    "src/qs8-gemm/gen/4x8c2-minmax-neon-mlal-padal-dup.c",
+    "src/qs8-gemm/gen/4x8c2-minmax-neon-mull-padal-dup.c",
+    "src/qs8-gemm/gen/4x8c8-minmax-neon-mlal-padal.c",
+    "src/qs8-gemm/gen/4x8c8-minmax-neon-mull-padal.c",
+    "src/qs8-gemm/gen/4x8c16-minmax-neon-mlal-padal.c",
+    "src/qs8-gemm/gen/4x16-minmax-neon-mlal-lane.c",
+    "src/qs8-gemm/gen/4x16-minmax-neon-mull-addw-dup.c",
+    "src/qs8-gemm/gen/4x16c2-minmax-neon-mlal-padal-dup.c",
+    "src/qs8-gemm/gen/4x16c2-minmax-neon-mull-padal-dup.c",
+    "src/qs8-gemm/gen/4x16c8-minmax-neon-mlal-padal.c",
+    "src/qs8-gemm/gen/4x16c8-minmax-neon-mull-padal.c",
+    "src/qs8-gemm/gen/4x16c16-minmax-neon-mlal-padal.c",
     "src/qs8-igemm/gen/1x8-minmax-neon-mlal-lane.c",
+    "src/qs8-igemm/gen/1x8-minmax-neon-mull-addw-dup.c",
+    "src/qs8-igemm/gen/1x8c2-minmax-neon-mlal-padal-dup.c",
+    "src/qs8-igemm/gen/1x8c2-minmax-neon-mull-padal-dup.c",
+    "src/qs8-igemm/gen/1x8c8-minmax-neon-mlal-padal.c",
+    "src/qs8-igemm/gen/1x8c8-minmax-neon-mull-padal.c",
+    "src/qs8-igemm/gen/1x8c16-minmax-neon-mlal-padal.c",
     "src/qs8-igemm/gen/1x16-minmax-neon-mlal-lane.c",
+    "src/qs8-igemm/gen/1x16-minmax-neon-mull-addw-dup.c",
+    "src/qs8-igemm/gen/1x16c2-minmax-neon-mlal-padal-dup.c",
+    "src/qs8-igemm/gen/1x16c2-minmax-neon-mull-padal-dup.c",
+    "src/qs8-igemm/gen/1x16c8-minmax-neon-mlal-padal.c",
+    "src/qs8-igemm/gen/1x16c8-minmax-neon-mull-padal.c",
+    "src/qs8-igemm/gen/1x16c16-minmax-neon-mlal-padal.c",
     "src/qs8-igemm/gen/2x8-minmax-neon-mlal-lane.c",
+    "src/qs8-igemm/gen/2x8-minmax-neon-mull-addw-dup.c",
+    "src/qs8-igemm/gen/2x8c2-minmax-neon-mlal-padal-dup.c",
+    "src/qs8-igemm/gen/2x8c2-minmax-neon-mull-padal-dup.c",
+    "src/qs8-igemm/gen/2x8c8-minmax-neon-mlal-padal.c",
+    "src/qs8-igemm/gen/2x8c8-minmax-neon-mull-padal.c",
+    "src/qs8-igemm/gen/2x8c16-minmax-neon-mlal-padal.c",
     "src/qs8-igemm/gen/2x16-minmax-neon-mlal-lane.c",
+    "src/qs8-igemm/gen/2x16-minmax-neon-mull-addw-dup.c",
+    "src/qs8-igemm/gen/2x16c2-minmax-neon-mlal-padal-dup.c",
+    "src/qs8-igemm/gen/2x16c2-minmax-neon-mull-padal-dup.c",
+    "src/qs8-igemm/gen/2x16c8-minmax-neon-mlal-padal.c",
+    "src/qs8-igemm/gen/2x16c8-minmax-neon-mull-padal.c",
+    "src/qs8-igemm/gen/2x16c16-minmax-neon-mlal-padal.c",
+    "src/qs8-igemm/gen/3x8-minmax-neon-mlal-lane.c",
+    "src/qs8-igemm/gen/3x8-minmax-neon-mull-addw-dup.c",
+    "src/qs8-igemm/gen/3x8c2-minmax-neon-mlal-padal-dup.c",
+    "src/qs8-igemm/gen/3x8c2-minmax-neon-mull-padal-dup.c",
+    "src/qs8-igemm/gen/3x8c8-minmax-neon-mlal-padal.c",
+    "src/qs8-igemm/gen/3x8c8-minmax-neon-mull-padal.c",
+    "src/qs8-igemm/gen/3x8c16-minmax-neon-mlal-padal.c",
+    "src/qs8-igemm/gen/3x16-minmax-neon-mlal-lane.c",
+    "src/qs8-igemm/gen/3x16-minmax-neon-mull-addw-dup.c",
+    "src/qs8-igemm/gen/3x16c2-minmax-neon-mlal-padal-dup.c",
+    "src/qs8-igemm/gen/3x16c2-minmax-neon-mull-padal-dup.c",
+    "src/qs8-igemm/gen/3x16c8-minmax-neon-mlal-padal.c",
+    "src/qs8-igemm/gen/3x16c8-minmax-neon-mull-padal.c",
+    "src/qs8-igemm/gen/3x16c16-minmax-neon-mlal-padal.c",
+    "src/qs8-igemm/gen/4x8-minmax-neon-mlal-lane.c",
+    "src/qs8-igemm/gen/4x8-minmax-neon-mull-addw-dup.c",
+    "src/qs8-igemm/gen/4x8c2-minmax-neon-mlal-padal-dup.c",
+    "src/qs8-igemm/gen/4x8c2-minmax-neon-mull-padal-dup.c",
+    "src/qs8-igemm/gen/4x8c8-minmax-neon-mlal-padal.c",
+    "src/qs8-igemm/gen/4x8c8-minmax-neon-mull-padal.c",
+    "src/qs8-igemm/gen/4x8c16-minmax-neon-mlal-padal.c",
+    "src/qs8-igemm/gen/4x16-minmax-neon-mlal-lane.c",
+    "src/qs8-igemm/gen/4x16-minmax-neon-mull-addw-dup.c",
+    "src/qs8-igemm/gen/4x16c2-minmax-neon-mlal-padal-dup.c",
+    "src/qs8-igemm/gen/4x16c2-minmax-neon-mull-padal-dup.c",
+    "src/qs8-igemm/gen/4x16c8-minmax-neon-mlal-padal.c",
+    "src/qs8-igemm/gen/4x16c8-minmax-neon-mull-padal.c",
+    "src/qs8-igemm/gen/4x16c16-minmax-neon-mlal-padal.c",
     "src/qs8-requantization/fp32-neon.c",
     "src/qs8-requantization/precise-neon.c",
     "src/qs8-requantization/q31-neon.c",
@@ -1776,6 +1878,8 @@
     "src/f32-gemm/gen/6x8-minmax-neonfma-dup-ld128.c",
     "src/f32-gemm/gen/6x8s4-minmax-neonfma.c",
     "src/f32-gemm/gen/8x8s4-minmax-neonfma.c",
+    "src/f32-ibilinear-chw/gen/neonfma-p4.c",
+    "src/f32-ibilinear-chw/gen/neonfma-p8.c",
     "src/f32-ibilinear/gen/neonfma-c4.c",
     "src/f32-ibilinear/gen/neonfma-c8.c",
     "src/f32-igemm/gen/1x8-minmax-neonfma-dup-ld64.c",
@@ -2172,7 +2276,6 @@
     "src/qs8-gemm/gen/6x16c4-minmax-neondot.c",
     "src/qs8-gemm/gen/8x8c4-minmax-neondot.c",
     "src/qs8-gemm/gen/8x16c4-minmax-neondot.c",
-    "src/qs8-gemm/gen/12x8c4-minmax-neondot.c",
     "src/qs8-igemm/gen/1x8c4-minmax-neondot.c",
     "src/qs8-igemm/gen/1x16c4-minmax-neondot.c",
     "src/qs8-igemm/gen/4x8c4-minmax-neondot.c",
@@ -2181,7 +2284,6 @@
     "src/qs8-igemm/gen/6x16c4-minmax-neondot.c",
     "src/qs8-igemm/gen/8x8c4-minmax-neondot.c",
     "src/qs8-igemm/gen/8x16c4-minmax-neondot.c",
-    "src/qs8-igemm/gen/12x8c4-minmax-neondot.c",
 ]
 
 SSE_UKERNELS = [
@@ -3460,9 +3562,18 @@
     "src/f32-igemm/gen/5x8-minmax-aarch64-neonfma-cortex-a75.S",
     "src/f32-igemm/gen/6x8-minmax-aarch64-neonfma-cortex-a57.S",
     "src/f32-igemm/gen/6x8-minmax-aarch64-neonfma-cortex-a75.S",
+    "src/qs8-gemm/1x16c4-aarch64-neondot-ld32.S",
     "src/qs8-gemm/1x16c4-aarch64-neondot-ld64.S",
+    "src/qs8-gemm/2x8c8-aarch64-neon-mull-padal.S",
+    "src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal.S",
+    "src/qs8-gemm/2x8c16-aarch64-neon-mlal-padal.S",
     "src/qs8-gemm/4x16c4-aarch64-neondot-cortex-a55.S",
+    "src/qs8-gemm/4x16c4-aarch64-neondot-ld32.S",
     "src/qs8-gemm/4x16c4-aarch64-neondot-ld64.S",
+    "src/qs8-igemm/2x8c8-aarch64-neon-mlal-padal.S",
+    "src/qs8-igemm/2x8c16-aarch64-neon-mlal-padal.S",
+    "src/qs8-igemm/4x16c4-aarch64-neondot-cortex-a55.S",
+    "src/qs8-igemm/4x16c4-aarch64-neondot-ld64.S",
 ]
 
 INTERNAL_MICROKERNEL_HDRS = [
@@ -5228,6 +5339,18 @@
 )
 
 cc_library(
+    name = "fp32_sparse_mobilenet_v1",
+    srcs = ["models/fp32-sparse-mobilenet-v1.cc"],
+    hdrs = ["models/models.h"],
+    copts = xnnpack_std_cxxopts(),
+    linkstatic = True,
+    deps = [
+        ":XNNPACK",
+        "@pthreadpool",
+    ],
+)
+
+cc_library(
     name = "fp16_mobilenet_v1",
     srcs = ["models/fp16-mobilenet-v1.cc"],
     hdrs = ["models/models.h"],
@@ -5265,6 +5388,18 @@
 )
 
 cc_library(
+    name = "qu8_mobilenet_v1",
+    srcs = ["models/qu8-mobilenet-v1.cc"],
+    hdrs = ["models/models.h"],
+    copts = xnnpack_std_cxxopts(),
+    linkstatic = True,
+    deps = [
+        ":XNNPACK",
+        "@pthreadpool",
+    ],
+)
+
+cc_library(
     name = "fp32_mobilenet_v2",
     srcs = ["models/fp32-mobilenet-v2.cc"],
     hdrs = ["models/models.h"],
@@ -5277,6 +5412,18 @@
 )
 
 cc_library(
+    name = "fp32_sparse_mobilenet_v2",
+    srcs = ["models/fp32-sparse-mobilenet-v2.cc"],
+    hdrs = ["models/models.h"],
+    copts = xnnpack_std_cxxopts(),
+    linkstatic = True,
+    deps = [
+        ":XNNPACK",
+        "@pthreadpool",
+    ],
+)
+
+cc_library(
     name = "fp16_mobilenet_v2",
     srcs = ["models/fp16-mobilenet-v2.cc"],
     hdrs = ["models/models.h"],
@@ -5302,6 +5449,18 @@
 )
 
 cc_library(
+    name = "fp32_sparse_mobilenet_v3_large",
+    srcs = ["models/fp32-sparse-mobilenet-v3-large.cc"],
+    hdrs = ["models/models.h"],
+    copts = xnnpack_std_cxxopts(),
+    linkstatic = True,
+    deps = [
+        ":XNNPACK",
+        "@pthreadpool",
+    ],
+)
+
+cc_library(
     name = "fp16_mobilenet_v3_large",
     srcs = ["models/fp16-mobilenet-v3-large.cc"],
     hdrs = ["models/models.h"],
@@ -5327,6 +5486,18 @@
 )
 
 cc_library(
+    name = "fp32_sparse_mobilenet_v3_small",
+    srcs = ["models/fp32-sparse-mobilenet-v3-small.cc"],
+    hdrs = ["models/models.h"],
+    copts = xnnpack_std_cxxopts(),
+    linkstatic = True,
+    deps = [
+        ":XNNPACK",
+        "@pthreadpool",
+    ],
+)
+
+cc_library(
     name = "fp16_mobilenet_v3_small",
     srcs = ["models/fp16-mobilenet-v3-small.cc"],
     hdrs = ["models/models.h"],
@@ -5370,6 +5541,19 @@
 )
 
 xnnpack_benchmark(
+    name = "qs8_gemm_e2e_bench",
+    srcs = [
+        "bench/qs8-gemm-e2e.cc",
+        "bench/end2end.h",
+    ] + MICROKERNEL_BENCHMARK_HDRS,
+    deps = MICROKERNEL_BENCHMARK_DEPS + [
+        ":XNNPACK",
+        ":qs8_mobilenet_v1",
+        ":qs8_mobilenet_v2",
+    ],
+)
+
+xnnpack_benchmark(
     name = "end2end_bench",
     srcs = ["bench/end2end.cc"],
     deps = [
@@ -5383,8 +5567,13 @@
         ":fp32_mobilenet_v2",
         ":fp32_mobilenet_v3_large",
         ":fp32_mobilenet_v3_small",
+        ":fp32_sparse_mobilenet_v1",
+        ":fp32_sparse_mobilenet_v2",
+        ":fp32_sparse_mobilenet_v3_large",
+        ":fp32_sparse_mobilenet_v3_small",
         ":qs8_mobilenet_v1",
         ":qs8_mobilenet_v2",
+        ":qu8_mobilenet_v1",
         "@pthreadpool",
     ],
 )
@@ -6783,6 +6972,7 @@
 
 xnnpack_unit_test(
     name = "add_nd_test",
+    timeout = "moderate",
     srcs = [
         "test/add-nd.cc",
         "test/binary-elementwise-operator-tester.h",
@@ -6855,6 +7045,7 @@
 
 xnnpack_unit_test(
     name = "convolution_nhwc_test",
+    timeout = "moderate",
     srcs = [
         "test/convolution-nhwc.cc",
         "test/convolution-operator-tester.h",
@@ -6864,6 +7055,7 @@
 
 xnnpack_unit_test(
     name = "convolution_nchw_test",
+    timeout = "moderate",
     srcs = [
         "test/convolution-nchw.cc",
         "test/convolution-operator-tester.h",
@@ -6982,6 +7174,7 @@
 
 xnnpack_unit_test(
     name = "max_pooling_nhwc_test",
+    timeout = "moderate",
     srcs = [
         "test/max-pooling-nhwc.cc",
         "test/max-pooling-operator-tester.h",
@@ -7314,14 +7507,22 @@
 )
 
 config_setting(
+    name = "macos_arm64",
+    values = {
+        "apple_platform_type": "macos",
+        "cpu": "darwin_arm64",
+    },
+)
+
+config_setting(
     name = "emscripten",
-    values = {"crosstool_top": "//toolchain:emscripten"},
+    values = {"crosstool_top": "//emscripten_toolchain:everything"},
 )
 
 config_setting(
     name = "emscripten_wasm",
     values = {
-        "crosstool_top": "//toolchain:emscripten",
+        "crosstool_top": "//emscripten_toolchain:everything",
         "cpu": "wasm",
     },
 )
@@ -7329,7 +7530,7 @@
 config_setting(
     name = "emscripten_wasmsimd",
     values = {
-        "crosstool_top": "//toolchain:emscripten",
+        "crosstool_top": "//emscripten_toolchain:everything",
         "cpu": "wasm",
         "copt": "-msimd128",
     },
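As a usage note, the new qs8_gemm_e2e_bench target added above is invoked like the other end-to-end benchmarks; a minimal sketch, assuming an opt build on a host or device with the required NEON features (--benchmark_filter is the standard Google Benchmark flag):

    bazel build -c opt :qs8_gemm_e2e_bench
    bazel-bin/qs8_gemm_e2e_bench --benchmark_filter=mobilenet_v2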
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a81cfc6..be0132e 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -617,8 +617,6 @@
   src/math/sigmoid-scalar-rr2-lut64-p2-div.c
   src/math/sigmoid-scalar-rr2-lut2048-p1-div.c
   src/math/sigmoid-scalar-rr2-p5-div.c
-  src/qs8-gemm/gen/8x8c4-minmax-scalar.c
-  src/qs8-gemm/gen/12x4c4-minmax-scalar.c
   src/qs8-requantization/fp32-scalar-lrintf.c
   src/qs8-requantization/fp32-scalar-magic.c
   src/qs8-requantization/precise-scalar-signed64.c
@@ -631,8 +629,6 @@
   src/qu8-gavgpool/7p7x-minmax-scalar-c1.c
   src/qu8-gavgpool/7x-minmax-scalar-c1.c
   src/qu8-gemm/2x2-minmax-scalar.c
-  src/qu8-gemm/gen/8x8c4-minmax-scalar.c
-  src/qu8-gemm/gen/12x4c4-minmax-scalar.c
   src/qu8-igemm/2x2-minmax-scalar.c
   src/qu8-requantization/fp32-scalar-lrintf.c
   src/qu8-requantization/fp32-scalar-magic.c
@@ -772,6 +768,8 @@
   src/f32-hswish/gen/hswish-neon-x4.c
   src/f32-hswish/gen/hswish-neon-x8.c
   src/f32-hswish/gen/hswish-neon-x16.c
+  src/f32-ibilinear-chw/gen/neon-p4.c
+  src/f32-ibilinear-chw/gen/neon-p8.c
   src/f32-ibilinear/gen/neon-c4.c
   src/f32-ibilinear/gen/neon-c8.c
   src/f32-igemm/gen/1x8-minmax-neon-dup-ld64.c
@@ -946,13 +944,117 @@
   src/qs8-gavgpool/gen/7x-minmax-neon-c24-acc2.c
   src/qs8-gavgpool/gen/7x-minmax-neon-c32-acc2.c
   src/qs8-gemm/gen/1x8-minmax-neon-mlal-lane.c
+  src/qs8-gemm/gen/1x8-minmax-neon-mull-addw-dup.c
+  src/qs8-gemm/gen/1x8c2-minmax-neon-mlal-padal-dup.c
+  src/qs8-gemm/gen/1x8c2-minmax-neon-mull-padal-dup.c
+  src/qs8-gemm/gen/1x8c8-minmax-neon-mlal-padal.c
+  src/qs8-gemm/gen/1x8c8-minmax-neon-mull-padal.c
+  src/qs8-gemm/gen/1x8c16-minmax-neon-mlal-padal.c
   src/qs8-gemm/gen/1x16-minmax-neon-mlal-lane.c
+  src/qs8-gemm/gen/1x16-minmax-neon-mull-addw-dup.c
+  src/qs8-gemm/gen/1x16c2-minmax-neon-mlal-padal-dup.c
+  src/qs8-gemm/gen/1x16c2-minmax-neon-mull-padal-dup.c
+  src/qs8-gemm/gen/1x16c8-minmax-neon-mlal-padal.c
+  src/qs8-gemm/gen/1x16c8-minmax-neon-mull-padal.c
+  src/qs8-gemm/gen/1x16c16-minmax-neon-mlal-padal.c
   src/qs8-gemm/gen/2x8-minmax-neon-mlal-lane.c
+  src/qs8-gemm/gen/2x8-minmax-neon-mull-addw-dup.c
+  src/qs8-gemm/gen/2x8c2-minmax-neon-mlal-padal-dup.c
+  src/qs8-gemm/gen/2x8c2-minmax-neon-mull-padal-dup.c
+  src/qs8-gemm/gen/2x8c8-minmax-neon-mlal-padal.c
+  src/qs8-gemm/gen/2x8c8-minmax-neon-mull-padal.c
+  src/qs8-gemm/gen/2x8c16-minmax-neon-mlal-padal.c
   src/qs8-gemm/gen/2x16-minmax-neon-mlal-lane.c
+  src/qs8-gemm/gen/2x16-minmax-neon-mull-addw-dup.c
+  src/qs8-gemm/gen/2x16c2-minmax-neon-mlal-padal-dup.c
+  src/qs8-gemm/gen/2x16c2-minmax-neon-mull-padal-dup.c
+  src/qs8-gemm/gen/2x16c8-minmax-neon-mlal-padal.c
+  src/qs8-gemm/gen/2x16c8-minmax-neon-mull-padal.c
+  src/qs8-gemm/gen/2x16c16-minmax-neon-mlal-padal.c
+  src/qs8-gemm/gen/3x8-minmax-neon-mlal-lane.c
+  src/qs8-gemm/gen/3x8-minmax-neon-mull-addw-dup.c
+  src/qs8-gemm/gen/3x8c2-minmax-neon-mlal-padal-dup.c
+  src/qs8-gemm/gen/3x8c2-minmax-neon-mull-padal-dup.c
+  src/qs8-gemm/gen/3x8c8-minmax-neon-mlal-padal.c
+  src/qs8-gemm/gen/3x8c8-minmax-neon-mull-padal.c
+  src/qs8-gemm/gen/3x8c16-minmax-neon-mlal-padal.c
+  src/qs8-gemm/gen/3x16-minmax-neon-mlal-lane.c
+  src/qs8-gemm/gen/3x16-minmax-neon-mull-addw-dup.c
+  src/qs8-gemm/gen/3x16c2-minmax-neon-mlal-padal-dup.c
+  src/qs8-gemm/gen/3x16c2-minmax-neon-mull-padal-dup.c
+  src/qs8-gemm/gen/3x16c8-minmax-neon-mlal-padal.c
+  src/qs8-gemm/gen/3x16c8-minmax-neon-mull-padal.c
+  src/qs8-gemm/gen/3x16c16-minmax-neon-mlal-padal.c
+  src/qs8-gemm/gen/4x8-minmax-neon-mlal-lane.c
+  src/qs8-gemm/gen/4x8-minmax-neon-mull-addw-dup.c
+  src/qs8-gemm/gen/4x8c2-minmax-neon-mlal-padal-dup.c
+  src/qs8-gemm/gen/4x8c2-minmax-neon-mull-padal-dup.c
+  src/qs8-gemm/gen/4x8c8-minmax-neon-mlal-padal.c
+  src/qs8-gemm/gen/4x8c8-minmax-neon-mull-padal.c
+  src/qs8-gemm/gen/4x8c16-minmax-neon-mlal-padal.c
+  src/qs8-gemm/gen/4x16-minmax-neon-mlal-lane.c
+  src/qs8-gemm/gen/4x16-minmax-neon-mull-addw-dup.c
+  src/qs8-gemm/gen/4x16c2-minmax-neon-mlal-padal-dup.c
+  src/qs8-gemm/gen/4x16c2-minmax-neon-mull-padal-dup.c
+  src/qs8-gemm/gen/4x16c8-minmax-neon-mlal-padal.c
+  src/qs8-gemm/gen/4x16c8-minmax-neon-mull-padal.c
+  src/qs8-gemm/gen/4x16c16-minmax-neon-mlal-padal.c
   src/qs8-igemm/gen/1x8-minmax-neon-mlal-lane.c
+  src/qs8-igemm/gen/1x8-minmax-neon-mull-addw-dup.c
+  src/qs8-igemm/gen/1x8c2-minmax-neon-mlal-padal-dup.c
+  src/qs8-igemm/gen/1x8c2-minmax-neon-mull-padal-dup.c
+  src/qs8-igemm/gen/1x8c8-minmax-neon-mlal-padal.c
+  src/qs8-igemm/gen/1x8c8-minmax-neon-mull-padal.c
+  src/qs8-igemm/gen/1x8c16-minmax-neon-mlal-padal.c
   src/qs8-igemm/gen/1x16-minmax-neon-mlal-lane.c
+  src/qs8-igemm/gen/1x16-minmax-neon-mull-addw-dup.c
+  src/qs8-igemm/gen/1x16c2-minmax-neon-mlal-padal-dup.c
+  src/qs8-igemm/gen/1x16c2-minmax-neon-mull-padal-dup.c
+  src/qs8-igemm/gen/1x16c8-minmax-neon-mlal-padal.c
+  src/qs8-igemm/gen/1x16c8-minmax-neon-mull-padal.c
+  src/qs8-igemm/gen/1x16c16-minmax-neon-mlal-padal.c
   src/qs8-igemm/gen/2x8-minmax-neon-mlal-lane.c
+  src/qs8-igemm/gen/2x8-minmax-neon-mull-addw-dup.c
+  src/qs8-igemm/gen/2x8c2-minmax-neon-mlal-padal-dup.c
+  src/qs8-igemm/gen/2x8c2-minmax-neon-mull-padal-dup.c
+  src/qs8-igemm/gen/2x8c8-minmax-neon-mlal-padal.c
+  src/qs8-igemm/gen/2x8c8-minmax-neon-mull-padal.c
+  src/qs8-igemm/gen/2x8c16-minmax-neon-mlal-padal.c
   src/qs8-igemm/gen/2x16-minmax-neon-mlal-lane.c
+  src/qs8-igemm/gen/2x16-minmax-neon-mull-addw-dup.c
+  src/qs8-igemm/gen/2x16c2-minmax-neon-mlal-padal-dup.c
+  src/qs8-igemm/gen/2x16c2-minmax-neon-mull-padal-dup.c
+  src/qs8-igemm/gen/2x16c8-minmax-neon-mlal-padal.c
+  src/qs8-igemm/gen/2x16c8-minmax-neon-mull-padal.c
+  src/qs8-igemm/gen/2x16c16-minmax-neon-mlal-padal.c
+  src/qs8-igemm/gen/3x8-minmax-neon-mlal-lane.c
+  src/qs8-igemm/gen/3x8-minmax-neon-mull-addw-dup.c
+  src/qs8-igemm/gen/3x8c2-minmax-neon-mlal-padal-dup.c
+  src/qs8-igemm/gen/3x8c2-minmax-neon-mull-padal-dup.c
+  src/qs8-igemm/gen/3x8c8-minmax-neon-mlal-padal.c
+  src/qs8-igemm/gen/3x8c8-minmax-neon-mull-padal.c
+  src/qs8-igemm/gen/3x8c16-minmax-neon-mlal-padal.c
+  src/qs8-igemm/gen/3x16-minmax-neon-mlal-lane.c
+  src/qs8-igemm/gen/3x16-minmax-neon-mull-addw-dup.c
+  src/qs8-igemm/gen/3x16c2-minmax-neon-mlal-padal-dup.c
+  src/qs8-igemm/gen/3x16c2-minmax-neon-mull-padal-dup.c
+  src/qs8-igemm/gen/3x16c8-minmax-neon-mlal-padal.c
+  src/qs8-igemm/gen/3x16c8-minmax-neon-mull-padal.c
+  src/qs8-igemm/gen/3x16c16-minmax-neon-mlal-padal.c
+  src/qs8-igemm/gen/4x8-minmax-neon-mlal-lane.c
+  src/qs8-igemm/gen/4x8-minmax-neon-mull-addw-dup.c
+  src/qs8-igemm/gen/4x8c2-minmax-neon-mlal-padal-dup.c
+  src/qs8-igemm/gen/4x8c2-minmax-neon-mull-padal-dup.c
+  src/qs8-igemm/gen/4x8c8-minmax-neon-mlal-padal.c
+  src/qs8-igemm/gen/4x8c8-minmax-neon-mull-padal.c
+  src/qs8-igemm/gen/4x8c16-minmax-neon-mlal-padal.c
+  src/qs8-igemm/gen/4x16-minmax-neon-mlal-lane.c
+  src/qs8-igemm/gen/4x16-minmax-neon-mull-addw-dup.c
+  src/qs8-igemm/gen/4x16c2-minmax-neon-mlal-padal-dup.c
+  src/qs8-igemm/gen/4x16c2-minmax-neon-mull-padal-dup.c
+  src/qs8-igemm/gen/4x16c8-minmax-neon-mlal-padal.c
+  src/qs8-igemm/gen/4x16c8-minmax-neon-mull-padal.c
+  src/qs8-igemm/gen/4x16c16-minmax-neon-mlal-padal.c
   src/qs8-requantization/fp32-neon.c
   src/qs8-requantization/precise-neon.c
   src/qs8-requantization/q31-neon.c
@@ -1024,6 +1126,8 @@
   src/f32-gemm/gen/6x8-minmax-neonfma-dup-ld128.c
   src/f32-gemm/gen/6x8s4-minmax-neonfma.c
   src/f32-gemm/gen/8x8s4-minmax-neonfma.c
+  src/f32-ibilinear-chw/gen/neonfma-p4.c
+  src/f32-ibilinear-chw/gen/neonfma-p8.c
   src/f32-ibilinear/gen/neonfma-c4.c
   src/f32-ibilinear/gen/neonfma-c8.c
   src/f32-igemm/gen/1x8-minmax-neonfma-dup-ld64.c
@@ -1416,7 +1520,6 @@
   src/qs8-gemm/gen/6x16c4-minmax-neondot.c
   src/qs8-gemm/gen/8x8c4-minmax-neondot.c
   src/qs8-gemm/gen/8x16c4-minmax-neondot.c
-  src/qs8-gemm/gen/12x8c4-minmax-neondot.c
   src/qs8-igemm/gen/1x8c4-minmax-neondot.c
   src/qs8-igemm/gen/1x16c4-minmax-neondot.c
   src/qs8-igemm/gen/4x8c4-minmax-neondot.c
@@ -1424,8 +1527,7 @@
   src/qs8-igemm/gen/6x8c4-minmax-neondot.c
   src/qs8-igemm/gen/6x16c4-minmax-neondot.c
   src/qs8-igemm/gen/8x8c4-minmax-neondot.c
-  src/qs8-igemm/gen/8x16c4-minmax-neondot.c
-  src/qs8-igemm/gen/12x8c4-minmax-neondot.c)
+  src/qs8-igemm/gen/8x16c4-minmax-neondot.c)
 
 SET(XNNPACK_SSE_MICROKERNEL_SRCS
   src/f32-avgpool/9p8x-minmax-sse-c4.c
@@ -2686,9 +2788,18 @@
   src/f32-igemm/gen/5x8-minmax-aarch64-neonfma-cortex-a75.S
   src/f32-igemm/gen/6x8-minmax-aarch64-neonfma-cortex-a57.S
   src/f32-igemm/gen/6x8-minmax-aarch64-neonfma-cortex-a75.S
+  src/qs8-gemm/1x16c4-aarch64-neondot-ld32.S
   src/qs8-gemm/1x16c4-aarch64-neondot-ld64.S
+  src/qs8-gemm/2x8c8-aarch64-neon-mull-padal.S
+  src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal.S
+  src/qs8-gemm/2x8c16-aarch64-neon-mlal-padal.S
   src/qs8-gemm/4x16c4-aarch64-neondot-cortex-a55.S
-  src/qs8-gemm/4x16c4-aarch64-neondot-ld64.S)
+  src/qs8-gemm/4x16c4-aarch64-neondot-ld32.S
+  src/qs8-gemm/4x16c4-aarch64-neondot-ld64.S
+  src/qs8-igemm/2x8c8-aarch64-neon-mlal-padal.S
+  src/qs8-igemm/2x8c16-aarch64-neon-mlal-padal.S
+  src/qs8-igemm/4x16c4-aarch64-neondot-cortex-a55.S
+  src/qs8-igemm/4x16c4-aarch64-neondot-ld64.S)
 
 SET(XNNPACK_MICROKERNEL_SRCS ${XNNPACK_SCALAR_MICROKERNEL_SRCS})
 IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^armv[5-8]" OR IOS_ARCH MATCHES "^armv7")
@@ -4627,12 +4738,17 @@
     models/fp32-mobilenet-v1.cc
     models/qs8-mobilenet-v1.cc
     models/qs8-mobilenet-v2.cc
+    models/qu8-mobilenet-v1.cc
     models/fp16-mobilenet-v2.cc
     models/fp32-mobilenet-v2.cc
     models/fp16-mobilenet-v3-large.cc
     models/fp32-mobilenet-v3-large.cc
     models/fp16-mobilenet-v3-small.cc
-    models/fp32-mobilenet-v3-small.cc)
+    models/fp32-mobilenet-v3-small.cc
+    models/fp32-sparse-mobilenet-v1.cc
+    models/fp32-sparse-mobilenet-v2.cc
+    models/fp32-sparse-mobilenet-v3-large.cc
+    models/fp32-sparse-mobilenet-v3-small.cc)
   SET_TARGET_PROPERTIES(bench-models PROPERTIES
     CXX_STANDARD 11
     CXX_STANDARD_REQUIRED YES
@@ -4664,6 +4780,14 @@
   TARGET_INCLUDE_DIRECTORIES(f32-gemm-e2e-bench PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}" src)
   TARGET_LINK_LIBRARIES(f32-gemm-e2e-bench PRIVATE XNNPACK benchmark bench-models bench-utils)
 
+  ADD_EXECUTABLE(qs8-gemm-e2e-bench bench/qs8-gemm-e2e.cc)
+  SET_TARGET_PROPERTIES(qs8-gemm-e2e-bench PROPERTIES
+    CXX_STANDARD 11
+    CXX_STANDARD_REQUIRED YES
+    CXX_EXTENSIONS NO)
+  TARGET_INCLUDE_DIRECTORIES(qs8-gemm-e2e-bench PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}" src)
+  TARGET_LINK_LIBRARIES(qs8-gemm-e2e-bench PRIVATE XNNPACK benchmark bench-models bench-utils)
+
   # ---[ Build operator-level microbenchmarks
   ADD_EXECUTABLE(average-pooling-bench bench/average-pooling.cc)
   SET_TARGET_PROPERTIES(average-pooling-bench PROPERTIES
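On the CMake side, the new qs8-gemm-e2e-bench executable added above builds alongside the other benchmarks; a minimal sketch, assuming the existing XNNPACK_BUILD_BENCHMARKS option is left at its default:

    cmake -S . -B build
    cmake --build build --target qs8-gemm-e2e-bench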
diff --git a/METADATA b/METADATA
index 804fead..05d2915 100644
--- a/METADATA
+++ b/METADATA
@@ -9,11 +9,11 @@
     type: GIT
     value: "https://github.com/google/XNNPACK"
   }
-  version: "db2475b8af7e129b722167c3579ea743735fd733"
+  version: "b0da47a9fab5216f120c5abc3aa44a61b2dab932"
   license_type: NOTICE
   last_upgrade_date {
     year: 2021
-    month: 1
-    day: 5
+    month: 3
+    day: 19
   }
 }
diff --git a/WORKSPACE b/WORKSPACE
index a901453..c772982 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -45,9 +45,9 @@
 # pthreadpool library, used for parallelization
 http_archive(
     name = "pthreadpool",
-    strip_prefix = "pthreadpool-545ebe9f225aec6dca49109516fac02e973a3de2",
-    sha256 = "8461f6540ae9f777ce20d1c0d1d249e5e61c438744fb390c0c6f91940aa69ea3",
-    urls = ["https://github.com/Maratyszcza/pthreadpool/archive/545ebe9f225aec6dca49109516fac02e973a3de2.zip"],
+    strip_prefix = "pthreadpool-b8374f80e42010941bda6c85b0e3f1a1bd77a1e0",
+    sha256 = "b96413b10dd8edaa4f6c0a60c6cf5ef55eebeef78164d5d69294c8173457f0ec",
+    urls = ["https://github.com/Maratyszcza/pthreadpool/archive/b8374f80e42010941bda6c85b0e3f1a1bd77a1e0.zip"],
 )
 
 # clog library, used for logging
diff --git a/bench/end2end.cc b/bench/end2end.cc
index 52c5436..629e4c6 100644
--- a/bench/end2end.cc
+++ b/bench/end2end.cc
@@ -68,6 +68,30 @@
   End2EndBenchmark(state, models::FP32MobileNetV3Small);
 }
 
+static void FP32Sparse80MobileNetV1(benchmark::State& state) {
+  End2EndBenchmark(state, [](pthreadpool_t threadpool) {
+    return models::FP32SparseMobileNetV1(0.8f, threadpool);
+  });
+}
+
+static void FP32Sparse80MobileNetV2(benchmark::State& state) {
+  End2EndBenchmark(state, [](pthreadpool_t threadpool) {
+    return models::FP32SparseMobileNetV2(0.8f, threadpool);
+  });
+}
+
+static void FP32Sparse80MobileNetV3Large(benchmark::State& state) {
+  End2EndBenchmark(state, [](pthreadpool_t threadpool) {
+    return models::FP32SparseMobileNetV3Large(0.8f, threadpool);
+  });
+}
+
+static void FP32Sparse80MobileNetV3Small(benchmark::State& state) {
+  End2EndBenchmark(state, [](pthreadpool_t threadpool) {
+    return models::FP32SparseMobileNetV3Small(0.8f, threadpool);
+  });
+}
+
 static void FP16MobileNetV1(benchmark::State& state) {
   End2EndBenchmark(state, models::FP16MobileNetV1);
 }
@@ -92,11 +116,20 @@
   End2EndBenchmark(state, models::QS8MobileNetV2);
 }
 
+static void QU8MobileNetV1(benchmark::State& state) {
+  End2EndBenchmark(state, models::QU8MobileNetV1);
+}
+
 BENCHMARK(FP32MobileNetV1)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
 BENCHMARK(FP32MobileNetV2)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
 BENCHMARK(FP32MobileNetV3Large)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
 BENCHMARK(FP32MobileNetV3Small)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
 
+BENCHMARK(FP32Sparse80MobileNetV1)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
+BENCHMARK(FP32Sparse80MobileNetV2)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
+BENCHMARK(FP32Sparse80MobileNetV3Large)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
+BENCHMARK(FP32Sparse80MobileNetV3Small)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
+
 BENCHMARK(FP16MobileNetV1)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
 BENCHMARK(FP16MobileNetV2)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
 BENCHMARK(FP16MobileNetV3Large)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
@@ -105,6 +138,8 @@
 BENCHMARK(QS8MobileNetV1)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
 BENCHMARK(QS8MobileNetV2)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
 
+BENCHMARK(QU8MobileNetV1)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
+
 #ifndef XNNPACK_BENCHMARK_NO_MAIN
 BENCHMARK_MAIN();
 #endif
\ No newline at end of file
diff --git a/bench/end2end.h b/bench/end2end.h
index a574867..2b87f0e 100644
--- a/bench/end2end.h
+++ b/bench/end2end.h
@@ -14,3 +14,7 @@
   BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v2, models::FP32MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime(); \
   BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v3_large, models::FP32MobileNetV3Large)->Unit(benchmark::kMicrosecond)->UseRealTime(); \
   BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v3_small, models::FP32MobileNetV3Small)->Unit(benchmark::kMicrosecond)->UseRealTime();
+
+#define BENCHMARK_QS8_END2END(benchmark_fn) \
+  BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v1, models::QS8MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime(); \
+  BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v2, models::QS8MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime();
diff --git a/bench/f32-dwconv-e2e.cc b/bench/f32-dwconv-e2e.cc
index 007fde2..abe1431 100644
--- a/bench/f32-dwconv-e2e.cc
+++ b/bench/f32-dwconv-e2e.cc
@@ -233,25 +233,25 @@
       32 /* cr */, 9 /* mr */, benchmark::utils::CheckAVX512F);
   }
 
-  BENCHMARK_FP32_END2END(f32_dwconv_up4x9__sse);
-  BENCHMARK_FP32_END2END(f32_dwconv_up4x9__sse_acc2);
-  BENCHMARK_FP32_END2END(f32_dwconv_up8x9__sse);
-  BENCHMARK_FP32_END2END(f32_dwconv_up8x9__sse_acc2);
-
-  BENCHMARK_FP32_END2END(f32_dwconv_up8x9__avx);
-  BENCHMARK_FP32_END2END(f32_dwconv_up8x9__avx_acc2);
-  BENCHMARK_FP32_END2END(f32_dwconv_up16x9__avx);
-  BENCHMARK_FP32_END2END(f32_dwconv_up16x9__avx_acc2);
+  BENCHMARK_FP32_END2END(f32_dwconv_up16x9__avx512f);
+  BENCHMARK_FP32_END2END(f32_dwconv_up16x9__avx512f_acc2);
+  BENCHMARK_FP32_END2END(f32_dwconv_up32x9__avx512f);
+  BENCHMARK_FP32_END2END(f32_dwconv_up32x9__avx512f_acc2);
 
   BENCHMARK_FP32_END2END(f32_dwconv_up8x9__fma3);
   BENCHMARK_FP32_END2END(f32_dwconv_up8x9__fma3_acc2);
   BENCHMARK_FP32_END2END(f32_dwconv_up16x9__fma3);
   BENCHMARK_FP32_END2END(f32_dwconv_up16x9__fma3_acc2);
 
-  BENCHMARK_FP32_END2END(f32_dwconv_up16x9__avx512f);
-  BENCHMARK_FP32_END2END(f32_dwconv_up16x9__avx512f_acc2);
-  BENCHMARK_FP32_END2END(f32_dwconv_up32x9__avx512f);
-  BENCHMARK_FP32_END2END(f32_dwconv_up32x9__avx512f_acc2);
+  BENCHMARK_FP32_END2END(f32_dwconv_up8x9__avx);
+  BENCHMARK_FP32_END2END(f32_dwconv_up8x9__avx_acc2);
+  BENCHMARK_FP32_END2END(f32_dwconv_up16x9__avx);
+  BENCHMARK_FP32_END2END(f32_dwconv_up16x9__avx_acc2);
+
+  BENCHMARK_FP32_END2END(f32_dwconv_up4x9__sse);
+  BENCHMARK_FP32_END2END(f32_dwconv_up4x9__sse_acc2);
+  BENCHMARK_FP32_END2END(f32_dwconv_up8x9__sse);
+  BENCHMARK_FP32_END2END(f32_dwconv_up8x9__sse_acc2);
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 #if XNN_ARCH_WASMSIMD
diff --git a/bench/f32-gemm-e2e.cc b/bench/f32-gemm-e2e.cc
index 281b4d1..074c545 100644
--- a/bench/f32-gemm-e2e.cc
+++ b/bench/f32-gemm-e2e.cc
@@ -886,6 +886,37 @@
       benchmark::utils::CheckAVX512F);
   }
 
+  BENCHMARK_FP32_END2END(f32_gemm_4x16__avx512f_broadcast);
+  BENCHMARK_FP32_END2END(f32_gemm_5x16__avx512f_broadcast);
+  BENCHMARK_FP32_END2END(f32_gemm_6x16__avx512f_broadcast);
+  BENCHMARK_FP32_END2END(f32_gemm_7x16__avx512f_broadcast);
+  BENCHMARK_FP32_END2END(f32_gemm_8x16__avx512f_broadcast);
+
+  BENCHMARK_FP32_END2END(f32_gemm_3x16s4__fma3_broadcast);
+  BENCHMARK_FP32_END2END(f32_gemm_4x16s4__fma3_broadcast);
+  BENCHMARK_FP32_END2END(f32_gemm_5x16s4__fma3_broadcast);
+
+  BENCHMARK_FP32_END2END(f32_gemm_4x8__fma3_broadcast);
+  BENCHMARK_FP32_END2END(f32_gemm_5x8__fma3_broadcast);
+  BENCHMARK_FP32_END2END(f32_gemm_6x8__fma3_broadcast);
+  BENCHMARK_FP32_END2END(f32_gemm_7x8__fma3_broadcast);
+  BENCHMARK_FP32_END2END(f32_gemm_8x8__fma3_broadcast);
+  BENCHMARK_FP32_END2END(f32_gemm_3x16__fma3_broadcast);
+  BENCHMARK_FP32_END2END(f32_gemm_4x16__fma3_broadcast);
+  BENCHMARK_FP32_END2END(f32_gemm_5x16__fma3_broadcast);
+
+  BENCHMARK_FP32_END2END(f32_gemm_4x8__avx_broadcast);
+  BENCHMARK_FP32_END2END(f32_gemm_5x8__avx_broadcast);
+  BENCHMARK_FP32_END2END(f32_gemm_6x8__avx_broadcast);
+  BENCHMARK_FP32_END2END(f32_gemm_7x8__avx_broadcast);
+  BENCHMARK_FP32_END2END(f32_gemm_3x16__avx_broadcast);
+  BENCHMARK_FP32_END2END(f32_gemm_4x16__avx_broadcast);
+  BENCHMARK_FP32_END2END(f32_gemm_5x16__avx_broadcast);
+
+  BENCHMARK_FP32_END2END(f32_gemm_3x8__sse2_dup);
+  BENCHMARK_FP32_END2END(f32_gemm_4x8__sse2_dup);
+  BENCHMARK_FP32_END2END(f32_gemm_5x8__sse2_dup);
+
   BENCHMARK_FP32_END2END(f32_gemm_3x8__sse_load1);
   BENCHMARK_FP32_END2END(f32_gemm_4x8__sse_load1);
   BENCHMARK_FP32_END2END(f32_gemm_5x8__sse_load1);
@@ -897,37 +928,6 @@
   BENCHMARK_FP32_END2END(f32_gemm_3x8s4__sse);
   BENCHMARK_FP32_END2END(f32_gemm_4x8s4__sse);
   BENCHMARK_FP32_END2END(f32_gemm_5x8s4__sse);
-
-  BENCHMARK_FP32_END2END(f32_gemm_3x8__sse2_dup);
-  BENCHMARK_FP32_END2END(f32_gemm_4x8__sse2_dup);
-  BENCHMARK_FP32_END2END(f32_gemm_5x8__sse2_dup);
-
-  BENCHMARK_FP32_END2END(f32_gemm_4x8__avx_broadcast);
-  BENCHMARK_FP32_END2END(f32_gemm_5x8__avx_broadcast);
-  BENCHMARK_FP32_END2END(f32_gemm_6x8__avx_broadcast);
-  BENCHMARK_FP32_END2END(f32_gemm_7x8__avx_broadcast);
-  BENCHMARK_FP32_END2END(f32_gemm_3x16__avx_broadcast);
-  BENCHMARK_FP32_END2END(f32_gemm_4x16__avx_broadcast);
-  BENCHMARK_FP32_END2END(f32_gemm_5x16__avx_broadcast);
-
-  BENCHMARK_FP32_END2END(f32_gemm_4x8__fma3_broadcast);
-  BENCHMARK_FP32_END2END(f32_gemm_5x8__fma3_broadcast);
-  BENCHMARK_FP32_END2END(f32_gemm_6x8__fma3_broadcast);
-  BENCHMARK_FP32_END2END(f32_gemm_7x8__fma3_broadcast);
-  BENCHMARK_FP32_END2END(f32_gemm_8x8__fma3_broadcast);
-  BENCHMARK_FP32_END2END(f32_gemm_3x16__fma3_broadcast);
-  BENCHMARK_FP32_END2END(f32_gemm_4x16__fma3_broadcast);
-  BENCHMARK_FP32_END2END(f32_gemm_5x16__fma3_broadcast);
-
-  BENCHMARK_FP32_END2END(f32_gemm_3x16s4__fma3_broadcast);
-  BENCHMARK_FP32_END2END(f32_gemm_4x16s4__fma3_broadcast);
-  BENCHMARK_FP32_END2END(f32_gemm_5x16s4__fma3_broadcast);
-
-  BENCHMARK_FP32_END2END(f32_gemm_4x16__avx512f_broadcast);
-  BENCHMARK_FP32_END2END(f32_gemm_5x16__avx512f_broadcast);
-  BENCHMARK_FP32_END2END(f32_gemm_6x16__avx512f_broadcast);
-  BENCHMARK_FP32_END2END(f32_gemm_7x16__avx512f_broadcast);
-  BENCHMARK_FP32_END2END(f32_gemm_8x16__avx512f_broadcast);
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 #if XNN_ARCH_WASMSIMD
diff --git a/bench/qs8-gemm-e2e.cc b/bench/qs8-gemm-e2e.cc
new file mode 100644
index 0000000..99858e1
--- /dev/null
+++ b/bench/qs8-gemm-e2e.cc
@@ -0,0 +1,1511 @@
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <algorithm>
+#include <cmath>
+#include <functional>
+#include <random>
+#include <vector>
+
+#include <xnnpack.h>
+
+#include <benchmark/benchmark.h>
+
+#include "bench/end2end.h"
+#include "bench/utils.h"
+#include "models/models.h"
+#include <xnnpack/gemm.h>
+#include <xnnpack/igemm.h>
+#include <xnnpack/params.h>
+
+
+// define XNN_ENABLE_FULL_BENCHMARKS=1 to enable all microkernel benchmarks.
+
+static void GEMMEnd2EndBenchmark(
+  benchmark::State& state,
+  models::ExecutionPlanFactory model_factory,
+  xnn_qs8_gemm_ukernel_function gemm,
+  xnn_qs8_igemm_ukernel_function igemm,
+  xnn_qs8_gemm_ukernel_function gemm1,
+  xnn_qs8_igemm_ukernel_function igemm1,
+  uint8_t mr, uint8_t nr, uint8_t log2_kr = 0, uint8_t log2_sr = 0,
+  benchmark::utils::IsaCheckFunction isa_check = nullptr)
+{
+  if (isa_check && !isa_check(state)) {
+    return;
+  }
+  if (xnn_initialize(nullptr /* allocator */) != xnn_status_success) {
+    state.SkipWithError("failed to initialize XNNPACK");
+    return;
+  }
+
+  // Override microkernels chosen in xnn_initialize
+  // Note: do not directly assign to xnn_params.qs8.gemm because it breaks older gcc.
+  xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_function(gemm));
+  xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_function(igemm));
+  xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_function(gemm1));
+  xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_function(igemm1));
+  xnn_params.qs8.gemm.mr = mr;
+  xnn_params.qs8.gemm.nr = nr;
+  xnn_params.qs8.gemm.log2_kr = log2_kr;
+  xnn_params.qs8.gemm.log2_sr = log2_sr;
+
+  auto execution_plan = model_factory(nullptr);
+  if (execution_plan.empty()) {
+    state.SkipWithError("failed to create a model");
+    return;
+  }
+
+  for (auto _ : state) {
+    for (const std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)>& op : execution_plan) {
+      xnn_status status = xnn_run_operator(op.get(), nullptr);
+      if (status != xnn_status_success) {
+        state.SkipWithError("failed to run a model");
+        return;
+      }
+    }
+  }
+
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
+}
+
+#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
+#if XNN_ENABLE_FULL_BENCHMARKS
+  static void qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld32(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld32,
+      xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot,
+      xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld32,
+      xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot,
+      1 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEONDOT);
+  }
+  static void qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld64,
+      xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot,
+      xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld64,
+      xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot,
+      1 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEONDOT);
+  }
+#endif  // XNN_ENABLE_FULL_BENCHMARKS
+
+  static void qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55,
+      xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55,
+      xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld64,
+      xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot,
+      4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEONDOT);
+  }
+  static void qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld32(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld32,
+      xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot,
+      xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld32,
+      xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot,
+      4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEONDOT);
+  }
+  static void qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64,
+      xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64,
+      xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld64,
+      xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot,
+      4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEONDOT);
+  }
+
+  static void qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mull_padal(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mull_padal,
+      xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mull_padal,
+      xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal,
+      xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mlal_padal,
+      2 /* mr */, 8  /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal,
+      xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal,
+      xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal,
+      xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mlal_padal,
+      2 /* mr */, 8  /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal,
+      xnn_qs8_igemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal,
+      xnn_qs8_gemm_minmax_ukernel_1x8c16__neon_mlal_padal,
+      xnn_qs8_igemm_minmax_ukernel_1x8c16__neon_mlal_padal,
+      2 /* mr */, 8  /* nr */, 4 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+
+#if XNN_ENABLE_FULL_BENCHMARKS
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld32)
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld64)
+#endif  // XNN_ENABLE_FULL_BENCHMARKS
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55)
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld32)
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64)
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mull_padal)
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal)
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal)
+#endif  // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+#if XNN_ENABLE_FULL_BENCHMARKS
+  static void qs8_gemm_minmax_ukernel_1x8__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_1x8__neon_mlal_lane,
+      xnn_qs8_igemm_minmax_ukernel_1x8__neon_mlal_lane,
+      xnn_qs8_gemm_minmax_ukernel_1x8__neon_mlal_lane,
+      xnn_qs8_igemm_minmax_ukernel_1x8__neon_mlal_lane,
+      1 /* mr */, 8  /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+
+  static void qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane,
+      xnn_qs8_igemm_minmax_ukernel_1x16__neon_mlal_lane,
+      xnn_qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane,
+      xnn_qs8_igemm_minmax_ukernel_1x16__neon_mlal_lane,
+      1 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+#endif  // XNN_ENABLE_FULL_BENCHMARKS
+
+  static void qs8_gemm_minmax_ukernel_2x8__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_2x8__neon_mlal_lane,
+      xnn_qs8_igemm_minmax_ukernel_2x8__neon_mlal_lane,
+      xnn_qs8_gemm_minmax_ukernel_1x8__neon_mlal_lane,
+      xnn_qs8_igemm_minmax_ukernel_1x8__neon_mlal_lane,
+      2 /* mr */, 8  /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+
+  static void qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane,
+      xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane,
+      xnn_qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane,
+      xnn_qs8_igemm_minmax_ukernel_1x16__neon_mlal_lane,
+      2 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+
+  static void qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane,
+      xnn_qs8_igemm_minmax_ukernel_3x8__neon_mlal_lane,
+      xnn_qs8_gemm_minmax_ukernel_1x8__neon_mlal_lane,
+      xnn_qs8_igemm_minmax_ukernel_1x8__neon_mlal_lane,
+      3 /* mr */, 8  /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+
+  static void qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane,
+      xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane,
+      xnn_qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane,
+      xnn_qs8_igemm_minmax_ukernel_1x16__neon_mlal_lane,
+      3 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+
+  static void qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane,
+      xnn_qs8_igemm_minmax_ukernel_4x8__neon_mlal_lane,
+      xnn_qs8_gemm_minmax_ukernel_1x8__neon_mlal_lane,
+      xnn_qs8_igemm_minmax_ukernel_1x8__neon_mlal_lane,
+      4 /* mr */, 8  /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+
+  static void qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane,
+      xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane,
+      xnn_qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane,
+      xnn_qs8_igemm_minmax_ukernel_1x16__neon_mlal_lane,
+      4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+
+#if XNN_ENABLE_FULL_BENCHMARKS
+  static void qs8_gemm_minmax_ukernel_1x8__neon_mull_addw_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_1x8__neon_mull_addw_dup,
+      xnn_qs8_igemm_minmax_ukernel_1x8__neon_mull_addw_dup,
+      xnn_qs8_gemm_minmax_ukernel_1x8__neon_mull_addw_dup,
+      xnn_qs8_igemm_minmax_ukernel_1x8__neon_mull_addw_dup,
+      1 /* mr */, 8  /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+
+  static void qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup,
+      xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup,
+      xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup,
+      xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup,
+      1 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+#endif  // XNN_ENABLE_FULL_BENCHMARKS
+
+  static void qs8_gemm_minmax_ukernel_2x8__neon_mull_addw_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_2x8__neon_mull_addw_dup,
+      xnn_qs8_igemm_minmax_ukernel_2x8__neon_mull_addw_dup,
+      xnn_qs8_gemm_minmax_ukernel_1x8__neon_mull_addw_dup,
+      xnn_qs8_igemm_minmax_ukernel_1x8__neon_mull_addw_dup,
+      2 /* mr */, 8  /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+
+  static void qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup,
+      xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup,
+      xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup,
+      xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup,
+      2 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+
+  static void qs8_gemm_minmax_ukernel_3x8__neon_mull_addw_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_3x8__neon_mull_addw_dup,
+      xnn_qs8_igemm_minmax_ukernel_3x8__neon_mull_addw_dup,
+      xnn_qs8_gemm_minmax_ukernel_1x8__neon_mull_addw_dup,
+      xnn_qs8_igemm_minmax_ukernel_1x8__neon_mull_addw_dup,
+      3 /* mr */, 8  /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+
+  static void qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup,
+      xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup,
+      xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup,
+      xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup,
+      3 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+
+  static void qs8_gemm_minmax_ukernel_4x8__neon_mull_addw_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_4x8__neon_mull_addw_dup,
+      xnn_qs8_igemm_minmax_ukernel_4x8__neon_mull_addw_dup,
+      xnn_qs8_gemm_minmax_ukernel_1x8__neon_mull_addw_dup,
+      xnn_qs8_igemm_minmax_ukernel_1x8__neon_mull_addw_dup,
+      4 /* mr */, 8  /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+
+  static void qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup,
+      xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup,
+      xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup,
+      xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup,
+      4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+
+#if XNN_ENABLE_FULL_BENCHMARKS
+  static void qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup,
+      xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup,
+      xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup,
+      xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup,
+      1 /* mr */, 8  /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+
+  static void qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup,
+      xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup,
+      xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup,
+      xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup,
+      1 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+#endif  // XNN_ENABLE_FULL_BENCHMARKS
+
+  static void qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup,
+      xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup,
+      xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup,
+      xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup,
+      2 /* mr */, 8  /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+
+  static void qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup,
+      xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup,
+      xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup,
+      xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup,
+      2 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+
+  static void qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup,
+      xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup,
+      xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup,
+      xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup,
+      3 /* mr */, 8  /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+
+  static void qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup,
+      xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup,
+      xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup,
+      xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup,
+      3 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+
+  static void qs8_gemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup,
+      xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup,
+      xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup,
+      xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup,
+      4 /* mr */, 8  /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+
+  static void qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup,
+      xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup,
+      xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup,
+      xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup,
+      4 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+
+#if XNN_ENABLE_FULL_BENCHMARKS
+  static void qs8_gemm_minmax_ukernel_1x8c2__neon_mull_padal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mull_padal_dup,
+      xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup,
+      xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mull_padal_dup,
+      xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup,
+      1 /* mr */, 8  /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+
+  static void qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup,
+      xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup,
+      xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup,
+      xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup,
+      1 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+#endif  // XNN_ENABLE_FULL_BENCHMARKS
+
+  static void qs8_gemm_minmax_ukernel_2x8c2__neon_mull_padal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mull_padal_dup,
+      xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup,
+      xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mull_padal_dup,
+      xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup,
+      2 /* mr */, 8  /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+
+  static void qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup,
+      xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup,
+      xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup,
+      xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup,
+      2 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+
+  static void qs8_gemm_minmax_ukernel_3x8c2__neon_mull_padal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mull_padal_dup,
+      xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup,
+      xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mull_padal_dup,
+      xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup,
+      3 /* mr */, 8  /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+
+  static void qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup,
+      xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup,
+      xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup,
+      xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup,
+      3 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+
+  static void qs8_gemm_minmax_ukernel_4x8c2__neon_mull_padal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mull_padal_dup,
+      xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup,
+      xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mull_padal_dup,
+      xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup,
+      4 /* mr */, 8  /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+
+  static void qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup,
+      xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup,
+      xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup,
+      xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup,
+      4 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+
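+  // 4-element channel-blocked (c4, log2_kr = 2) QS8 micro-kernels using the NEON dot-product extension, gated by CheckNEONDOT.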
+#if XNN_ENABLE_FULL_BENCHMARKS
+  static void qs8_gemm_minmax_ukernel_1x8c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_1x8c4__neondot,
+      xnn_qs8_igemm_minmax_ukernel_1x8c4__neondot,
+      xnn_qs8_gemm_minmax_ukernel_1x8c4__neondot,
+      xnn_qs8_igemm_minmax_ukernel_1x8c4__neondot,
+      1 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEONDOT);
+  }
+
+  static void qs8_gemm_minmax_ukernel_1x16c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot,
+      xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot,
+      xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot,
+      xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot,
+      1 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEONDOT);
+  }
+#endif  // XNN_ENABLE_FULL_BENCHMARKS
+
+  static void qs8_gemm_minmax_ukernel_4x8c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_4x8c4__neondot,
+      xnn_qs8_igemm_minmax_ukernel_4x8c4__neondot,
+      xnn_qs8_gemm_minmax_ukernel_1x8c4__neondot,
+      xnn_qs8_igemm_minmax_ukernel_1x8c4__neondot,
+      4 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEONDOT);
+  }
+
+  static void qs8_gemm_minmax_ukernel_4x16c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot,
+      xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot,
+      xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot,
+      xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot,
+      4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEONDOT);
+  }
+
+  static void qs8_gemm_minmax_ukernel_6x8c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_6x8c4__neondot,
+      xnn_qs8_igemm_minmax_ukernel_6x8c4__neondot,
+      xnn_qs8_gemm_minmax_ukernel_1x8c4__neondot,
+      xnn_qs8_igemm_minmax_ukernel_1x8c4__neondot,
+      6 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEONDOT);
+  }
+
+  static void qs8_gemm_minmax_ukernel_6x16c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot,
+      xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot,
+      xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot,
+      xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot,
+      6 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEONDOT);
+  }
+
+  static void qs8_gemm_minmax_ukernel_8x8c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_8x8c4__neondot,
+      xnn_qs8_igemm_minmax_ukernel_8x8c4__neondot,
+      xnn_qs8_gemm_minmax_ukernel_1x8c4__neondot,
+      xnn_qs8_igemm_minmax_ukernel_1x8c4__neondot,
+      8 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEONDOT);
+  }
+
+  static void qs8_gemm_minmax_ukernel_8x16c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot,
+      xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot,
+      xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot,
+      xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot,
+      8 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEONDOT);
+  }
+
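+  // 8-element channel-blocked (c8, log2_kr = 3) NEON "mull padal" QS8 GEMM/IGEMM micro-kernels.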
+#if XNN_ENABLE_FULL_BENCHMARKS
+  static void qs8_gemm_minmax_ukernel_1x8c8__neon_mull_padal(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mull_padal,
+      xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mull_padal,
+      xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mull_padal,
+      xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mull_padal,
+      1 /* mr */, 8  /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+
+  static void qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal,
+      xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal,
+      xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal,
+      xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal,
+      1 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+#endif  // XNN_ENABLE_FULL_BENCHMARKS
+
+  static void qs8_gemm_minmax_ukernel_2x8c8__neon_mull_padal(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mull_padal,
+      xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mull_padal,
+      xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mull_padal,
+      xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mull_padal,
+      2 /* mr */, 8  /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+
+  static void qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal,
+      xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal,
+      xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal,
+      xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal,
+      2 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+
+  static void qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal,
+      xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal,
+      xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mull_padal,
+      xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mull_padal,
+      3 /* mr */, 8  /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+
+  static void qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal,
+      xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal,
+      xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal,
+      xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal,
+      3 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+
+  static void qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal,
+      xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal,
+      xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mull_padal,
+      xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mull_padal,
+      4 /* mr */, 8  /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+
+  static void qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal,
+      xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal,
+      xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal,
+      xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal,
+      4 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+
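+  // 16-element channel-blocked (c16, log2_kr = 4) NEON "mlal padal" QS8 GEMM/IGEMM micro-kernels.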
+#if XNN_ENABLE_FULL_BENCHMARKS
+  static void qs8_gemm_minmax_ukernel_1x8c16__neon_mlal_padal(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_1x8c16__neon_mlal_padal,
+      xnn_qs8_igemm_minmax_ukernel_1x8c16__neon_mlal_padal,
+      xnn_qs8_gemm_minmax_ukernel_1x8c16__neon_mlal_padal,
+      xnn_qs8_igemm_minmax_ukernel_1x8c16__neon_mlal_padal,
+      1 /* mr */, 8  /* nr */, 4 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+
+  static void qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal,
+      xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal,
+      xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal,
+      xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal,
+      1 /* mr */, 16 /* nr */, 4 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+#endif  // XNN_ENABLE_FULL_BENCHMARKS
+
+  static void qs8_gemm_minmax_ukernel_2x8c16__neon_mlal_padal(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_2x8c16__neon_mlal_padal,
+      xnn_qs8_igemm_minmax_ukernel_2x8c16__neon_mlal_padal,
+      xnn_qs8_gemm_minmax_ukernel_1x8c16__neon_mlal_padal,
+      xnn_qs8_igemm_minmax_ukernel_1x8c16__neon_mlal_padal,
+      2 /* mr */, 8  /* nr */, 4 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+
+  static void qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal,
+      xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal,
+      xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal,
+      xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal,
+      2 /* mr */, 16 /* nr */, 4 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+
+  static void qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal,
+      xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal,
+      xnn_qs8_gemm_minmax_ukernel_1x8c16__neon_mlal_padal,
+      xnn_qs8_igemm_minmax_ukernel_1x8c16__neon_mlal_padal,
+      3 /* mr */, 8  /* nr */, 4 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+
+  static void qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal,
+      xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal,
+      xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal,
+      xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal,
+      3 /* mr */, 16 /* nr */, 4 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+
+  static void qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal,
+      xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal,
+      xnn_qs8_gemm_minmax_ukernel_1x8c16__neon_mlal_padal,
+      xnn_qs8_igemm_minmax_ukernel_1x8c16__neon_mlal_padal,
+      4 /* mr */, 8  /* nr */, 4 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+
+  static void qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal,
+      xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal,
+      xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal,
+      xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal,
+      4 /* mr */, 16 /* nr */, 4 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+
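+  // 8-element channel-blocked (c8, log2_kr = 3) NEON "mlal padal" QS8 GEMM/IGEMM micro-kernels.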
+#if XNN_ENABLE_FULL_BENCHMARKS
+  static void qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal,
+      xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mlal_padal,
+      xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal,
+      xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mlal_padal,
+      1 /* mr */, 8  /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+
+  static void qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal,
+      xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal,
+      xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal,
+      xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal,
+      1 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+#endif  // XNN_ENABLE_FULL_BENCHMARKS
+
+  static void qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal,
+      xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal,
+      xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal,
+      xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mlal_padal,
+      2 /* mr */, 8  /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+
+  static void qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal,
+      xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal,
+      xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal,
+      xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal,
+      2 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+
+  static void qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal,
+      xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal,
+      xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal,
+      xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mlal_padal,
+      3 /* mr */, 8  /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+
+  static void qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal,
+      xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal,
+      xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal,
+      xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal,
+      3 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+
+  static void qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal,
+      xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal,
+      xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal,
+      xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mlal_padal,
+      4 /* mr */, 8  /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+
+  static void qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal,
+      xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal,
+      xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal,
+      xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal,
+      4 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+
+#if XNN_ENABLE_FULL_BENCHMARKS
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_1x8c4__neondot);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_1x16c4__neondot);
+#endif  // XNN_ENABLE_FULL_BENCHMARKS
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_4x8c4__neondot);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_4x16c4__neondot);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_6x8c4__neondot);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_6x16c4__neondot);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_8x8c4__neondot);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_8x16c4__neondot);
+
+#if XNN_ENABLE_FULL_BENCHMARKS
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal);
+#endif  // XNN_ENABLE_FULL_BENCHMARKS
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal);
+
+#if XNN_ENABLE_FULL_BENCHMARKS
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_1x8c8__neon_mull_padal);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal);
+#endif  // XNN_ENABLE_FULL_BENCHMARKS
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_2x8c8__neon_mull_padal);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal);
+
+#if XNN_ENABLE_FULL_BENCHMARKS
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_1x8c16__neon_mlal_padal);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal);
+#endif  // XNN_ENABLE_FULL_BENCHMARKS
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_2x8c16__neon_mlal_padal);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal);
+
+#if XNN_ENABLE_FULL_BENCHMARKS
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+#endif  // XNN_ENABLE_FULL_BENCHMARKS
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+
+#if XNN_ENABLE_FULL_BENCHMARKS
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+#endif  // XNN_ENABLE_FULL_BENCHMARKS
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+
+#if XNN_ENABLE_FULL_BENCHMARKS
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_1x8__neon_mlal_lane);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane);
+#endif  // XNN_ENABLE_FULL_BENCHMARKS
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_2x8__neon_mlal_lane);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane);
+
+#if XNN_ENABLE_FULL_BENCHMARKS
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_1x8__neon_mull_addw_dup);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup);
+#endif  // XNN_ENABLE_FULL_BENCHMARKS
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_2x8__neon_mull_addw_dup);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_3x8__neon_mull_addw_dup);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_4x8__neon_mull_addw_dup);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup);
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
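+  // x86 QS8 GEMM/IGEMM end-to-end benchmarks covering SSE2, SSSE3, SSE4.1, XOP, AVX2, and AVX-512 micro-kernels;
+  // non-baseline ISAs are gated by the corresponding benchmark::utils::Check* helper.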
+#if XNN_ENABLE_FULL_BENCHMARKS
+  static void qs8_gemm_minmax_ukernel_1x4c2__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_1x4c2__sse2_ld64,
+      xnn_qs8_igemm_minmax_ukernel_1x4c2__sse2_ld64,
+      xnn_qs8_gemm_minmax_ukernel_1x4c2__sse2_ld64,
+      xnn_qs8_igemm_minmax_ukernel_1x4c2__sse2_ld64,
+      1 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */);
+  }
+
+  static void qs8_gemm_minmax_ukernel_1x4c2__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_1x4c2__sse2_ld128,
+      xnn_qs8_igemm_minmax_ukernel_1x4c2__sse2_ld128,
+      xnn_qs8_gemm_minmax_ukernel_1x4c2__sse2_ld128,
+      xnn_qs8_igemm_minmax_ukernel_1x4c2__sse2_ld128,
+      1 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */);
+  }
+
+  static void qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64,
+      xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64,
+      xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64,
+      xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64,
+      1 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckSSE41);
+  }
+
+  static void qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128,
+      xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128,
+      xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128,
+      xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128,
+      1 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckSSE41);
+  }
+
+  static void qs8_gemm_minmax_ukernel_1x4c2__ssse3_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_1x4c2__ssse3_ld64,
+      xnn_qs8_igemm_minmax_ukernel_1x4c2__ssse3_ld64,
+      xnn_qs8_gemm_minmax_ukernel_1x4c2__ssse3_ld64,
+      xnn_qs8_igemm_minmax_ukernel_1x4c2__ssse3_ld64,
+      1 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckSSSE3);
+  }
+
+  static void qs8_gemm_minmax_ukernel_1x4c2__ssse3_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_1x4c2__ssse3_ld128,
+      xnn_qs8_igemm_minmax_ukernel_1x4c2__ssse3_ld128,
+      xnn_qs8_gemm_minmax_ukernel_1x4c2__ssse3_ld128,
+      xnn_qs8_igemm_minmax_ukernel_1x4c2__ssse3_ld128,
+      1 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckSSSE3);
+  }
+
+  static void qs8_gemm_minmax_ukernel_1x4c2__xop_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64,
+      xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64,
+      xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64,
+      xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64,
+      1 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckXOP);
+  }
+
+  static void qs8_gemm_minmax_ukernel_1x4c2__xop_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128,
+      xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128,
+      xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128,
+      xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128,
+      1 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckXOP);
+  }
+
+  static void qs8_gemm_minmax_ukernel_1x4c8__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_1x4c8__sse2_ld64,
+      xnn_qs8_igemm_minmax_ukernel_1x4c8__sse2_ld64,
+      xnn_qs8_gemm_minmax_ukernel_1x4c8__sse2_ld64,
+      xnn_qs8_igemm_minmax_ukernel_1x4c8__sse2_ld64,
+      1 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */);
+  }
+
+  static void qs8_gemm_minmax_ukernel_1x4c8__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_1x4c8__sse2_ld128,
+      xnn_qs8_igemm_minmax_ukernel_1x4c8__sse2_ld128,
+      xnn_qs8_gemm_minmax_ukernel_1x4c8__sse2_ld128,
+      xnn_qs8_igemm_minmax_ukernel_1x4c8__sse2_ld128,
+      1 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */);
+  }
+
+  static void qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64,
+      xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64,
+      xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64,
+      xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64,
+      1 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckSSE41);
+  }
+
+  static void qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128,
+      xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128,
+      xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128,
+      xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128,
+      1 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckSSE41);
+  }
+
+  static void qs8_gemm_minmax_ukernel_1x4c8__ssse3_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_1x4c8__ssse3_ld64,
+      xnn_qs8_igemm_minmax_ukernel_1x4c8__ssse3_ld64,
+      xnn_qs8_gemm_minmax_ukernel_1x4c8__ssse3_ld64,
+      xnn_qs8_igemm_minmax_ukernel_1x4c8__ssse3_ld64,
+      1 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckSSSE3);
+  }
+
+  static void qs8_gemm_minmax_ukernel_1x4c8__ssse3_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_1x4c8__ssse3_ld128,
+      xnn_qs8_igemm_minmax_ukernel_1x4c8__ssse3_ld128,
+      xnn_qs8_gemm_minmax_ukernel_1x4c8__ssse3_ld128,
+      xnn_qs8_igemm_minmax_ukernel_1x4c8__ssse3_ld128,
+      1 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckSSSE3);
+  }
+
+  static void qs8_gemm_minmax_ukernel_1x4c8__xop_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64,
+      xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64,
+      xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64,
+      xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64,
+      1 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckXOP);
+  }
+
+  static void qs8_gemm_minmax_ukernel_1x4c8__xop_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128,
+      xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128,
+      xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128,
+      xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128,
+      1 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckXOP);
+  }
+
+  static void qs8_gemm_minmax_ukernel_1x8c8__avx2(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2,
+      xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2,
+      xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2,
+      xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2,
+      1 /* mr */, 8 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckAVX2);
+  }
+
+  static void qs8_gemm_minmax_ukernel_1x16c8__avx512skx(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx,
+      xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx,
+      xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx,
+      xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx,
+      1 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckAVX512F);
+  }
+#endif  // XNN_ENABLE_FULL_BENCHMARKS
+
+  static void qs8_gemm_minmax_ukernel_2x4c8__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_2x4c8__sse2_ld64,
+      xnn_qs8_igemm_minmax_ukernel_2x4c8__sse2_ld64,
+      xnn_qs8_gemm_minmax_ukernel_1x4c8__sse2_ld64,
+      xnn_qs8_igemm_minmax_ukernel_1x4c8__sse2_ld64,
+      2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */);
+  }
+
+  static void qs8_gemm_minmax_ukernel_2x4c8__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_2x4c8__sse2_ld128,
+      xnn_qs8_igemm_minmax_ukernel_2x4c8__sse2_ld128,
+      xnn_qs8_gemm_minmax_ukernel_1x4c8__sse2_ld128,
+      xnn_qs8_igemm_minmax_ukernel_1x4c8__sse2_ld128,
+      2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */);
+  }
+
+  static void qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64,
+      xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64,
+      xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64,
+      xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64,
+      2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckSSE41);
+  }
+
+  static void qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128,
+      xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128,
+      xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128,
+      xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128,
+      2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckSSE41);
+  }
+
+  static void qs8_gemm_minmax_ukernel_2x4c8__ssse3_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_2x4c8__ssse3_ld64,
+      xnn_qs8_igemm_minmax_ukernel_2x4c8__ssse3_ld64,
+      xnn_qs8_gemm_minmax_ukernel_1x4c8__ssse3_ld64,
+      xnn_qs8_igemm_minmax_ukernel_1x4c8__ssse3_ld64,
+      2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckSSSE3);
+  }
+
+  static void qs8_gemm_minmax_ukernel_2x4c8__ssse3_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_2x4c8__ssse3_ld128,
+      xnn_qs8_igemm_minmax_ukernel_2x4c8__ssse3_ld128,
+      xnn_qs8_gemm_minmax_ukernel_1x4c8__ssse3_ld128,
+      xnn_qs8_igemm_minmax_ukernel_1x4c8__ssse3_ld128,
+      2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckSSSE3);
+  }
+
+  static void qs8_gemm_minmax_ukernel_2x4c8__xop_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64,
+      xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64,
+      xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64,
+      xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64,
+      2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckXOP);
+  }
+
+  static void qs8_gemm_minmax_ukernel_2x4c8__xop_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128,
+      xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128,
+      xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128,
+      xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128,
+      2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckXOP);
+  }
+
+  static void qs8_gemm_minmax_ukernel_2x8c8__avx2(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2,
+      xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2,
+      xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2,
+      xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2,
+      2 /* mr */, 8 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckAVX2);
+  }
+
+  static void qs8_gemm_minmax_ukernel_2x16c8__avx512skx(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx,
+      xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx,
+      xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx,
+      xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx,
+      2 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckAVX512F);
+  }
+
+  static void qs8_gemm_minmax_ukernel_3x4c8__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_3x4c8__sse2_ld64,
+      xnn_qs8_igemm_minmax_ukernel_3x4c8__sse2_ld64,
+      xnn_qs8_gemm_minmax_ukernel_1x4c8__sse2_ld64,
+      xnn_qs8_igemm_minmax_ukernel_1x4c8__sse2_ld64,
+      3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */);
+  }
+
+  static void qs8_gemm_minmax_ukernel_3x4c8__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_3x4c8__sse2_ld128,
+      xnn_qs8_igemm_minmax_ukernel_3x4c8__sse2_ld128,
+      xnn_qs8_gemm_minmax_ukernel_1x4c8__sse2_ld128,
+      xnn_qs8_igemm_minmax_ukernel_1x4c8__sse2_ld128,
+      3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */);
+  }
+
+  static void qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64,
+      xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64,
+      xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64,
+      xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64,
+      3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckSSE41);
+  }
+
+  static void qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128,
+      xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128,
+      xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128,
+      xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128,
+      3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckSSE41);
+  }
+
+  static void qs8_gemm_minmax_ukernel_3x4c8__ssse3_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_3x4c8__ssse3_ld64,
+      xnn_qs8_igemm_minmax_ukernel_3x4c8__ssse3_ld64,
+      xnn_qs8_gemm_minmax_ukernel_1x4c8__ssse3_ld64,
+      xnn_qs8_igemm_minmax_ukernel_1x4c8__ssse3_ld64,
+      3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckSSSE3);
+  }
+
+  static void qs8_gemm_minmax_ukernel_3x4c8__ssse3_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_3x4c8__ssse3_ld128,
+      xnn_qs8_igemm_minmax_ukernel_3x4c8__ssse3_ld128,
+      xnn_qs8_gemm_minmax_ukernel_1x4c8__ssse3_ld128,
+      xnn_qs8_igemm_minmax_ukernel_1x4c8__ssse3_ld128,
+      3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckSSSE3);
+  }
+
+  static void qs8_gemm_minmax_ukernel_3x4c8__xop_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64,
+      xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64,
+      xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64,
+      xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64,
+      3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckXOP);
+  }
+
+  static void qs8_gemm_minmax_ukernel_3x4c8__xop_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128,
+      xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128,
+      xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128,
+      xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128,
+      3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckXOP);
+  }
+
+  static void qs8_gemm_minmax_ukernel_3x8c8__avx2(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2,
+      xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2,
+      xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2,
+      xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2,
+      3 /* mr */, 8 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckAVX2);
+  }
+
+  static void qs8_gemm_minmax_ukernel_3x16c8__avx512skx(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx,
+      xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx,
+      xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx,
+      xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx,
+      3 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckAVX512F);
+  }
+
+  static void qs8_gemm_minmax_ukernel_4x4c2__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_4x4c2__sse2_ld64,
+      xnn_qs8_igemm_minmax_ukernel_4x4c2__sse2_ld64,
+      xnn_qs8_gemm_minmax_ukernel_1x4c2__sse2_ld64,
+      xnn_qs8_igemm_minmax_ukernel_1x4c2__sse2_ld64,
+      4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */);
+  }
+
+  static void qs8_gemm_minmax_ukernel_4x4c2__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_4x4c2__sse2_ld128,
+      xnn_qs8_igemm_minmax_ukernel_4x4c2__sse2_ld128,
+      xnn_qs8_gemm_minmax_ukernel_1x4c2__sse2_ld128,
+      xnn_qs8_igemm_minmax_ukernel_1x4c2__sse2_ld128,
+      4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */);
+  }
+
+  static void qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64,
+      xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64,
+      xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64,
+      xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64,
+      4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckSSE41);
+  }
+
+  static void qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128,
+      xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128,
+      xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128,
+      xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128,
+      4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckSSE41);
+  }
+
+  static void qs8_gemm_minmax_ukernel_4x4c2__ssse3_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_4x4c2__ssse3_ld64,
+      xnn_qs8_igemm_minmax_ukernel_4x4c2__ssse3_ld64,
+      xnn_qs8_gemm_minmax_ukernel_1x4c2__ssse3_ld64,
+      xnn_qs8_igemm_minmax_ukernel_1x4c2__ssse3_ld64,
+      4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckSSSE3);
+  }
+
+  static void qs8_gemm_minmax_ukernel_4x4c2__ssse3_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_4x4c2__ssse3_ld128,
+      xnn_qs8_igemm_minmax_ukernel_4x4c2__ssse3_ld128,
+      xnn_qs8_gemm_minmax_ukernel_1x4c2__ssse3_ld128,
+      xnn_qs8_igemm_minmax_ukernel_1x4c2__ssse3_ld128,
+      4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckSSSE3);
+  }
+
+  static void qs8_gemm_minmax_ukernel_4x4c2__xop_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64,
+      xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64,
+      xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64,
+      xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64,
+      4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckXOP);
+  }
+
+  static void qs8_gemm_minmax_ukernel_4x4c2__xop_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128,
+      xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128,
+      xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128,
+      xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128,
+      4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckXOP);
+  }
+
+  static void qs8_gemm_minmax_ukernel_4x16c8__avx512skx(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx,
+      xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx,
+      xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx,
+      xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx,
+      4 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckAVX512F);
+  }
+
+#if XNN_ENABLE_FULL_BENCHMARKS
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_1x16c8__avx512skx);
+#endif  // XNN_ENABLE_FULL_BENCHMARKS
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_2x16c8__avx512skx);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_3x16c8__avx512skx);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_4x16c8__avx512skx);
+
+#if XNN_ENABLE_FULL_BENCHMARKS
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_1x8c8__avx2);
+#endif  // XNN_ENABLE_FULL_BENCHMARKS
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_2x8c8__avx2);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_3x8c8__avx2);
+
+#if XNN_ENABLE_FULL_BENCHMARKS
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_1x4c2__xop_ld64);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_1x4c2__xop_ld128);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_1x4c8__xop_ld64);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_1x4c8__xop_ld128);
+#endif  // XNN_ENABLE_FULL_BENCHMARKS
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_2x4c8__xop_ld64);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_2x4c8__xop_ld128);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_3x4c8__xop_ld64);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_3x4c8__xop_ld128);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_4x4c2__xop_ld64);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_4x4c2__xop_ld128);
+
+#if XNN_ENABLE_FULL_BENCHMARKS
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128);
+#endif  // XNN_ENABLE_FULL_BENCHMARKS
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128);
+
+#if XNN_ENABLE_FULL_BENCHMARKS
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_1x4c2__ssse3_ld64);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_1x4c2__ssse3_ld128);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_1x4c8__ssse3_ld64);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_1x4c8__ssse3_ld128);
+#endif  // XNN_ENABLE_FULL_BENCHMARKS
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_2x4c8__ssse3_ld64);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_2x4c8__ssse3_ld128);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_3x4c8__ssse3_ld64);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_3x4c8__ssse3_ld128);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_4x4c2__ssse3_ld64);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_4x4c2__ssse3_ld128);
+
+#if XNN_ENABLE_FULL_BENCHMARKS
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_1x4c2__sse2_ld64);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_1x4c2__sse2_ld128);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_1x4c8__sse2_ld64);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_1x4c8__sse2_ld128);
+#endif  // XNN_ENABLE_FULL_BENCHMARKS
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_2x4c8__sse2_ld64);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_2x4c8__sse2_ld128);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_3x4c8__sse2_ld64);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_3x4c8__sse2_ld128);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_4x4c2__sse2_ld64);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_4x4c2__sse2_ld128);
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+#if XNN_ARCH_WASMSIMD
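+  // WAsm SIMD QS8 GEMM/IGEMM end-to-end benchmarks with 8-element channel blocking (c8, log2_kr = 3).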
+#if XNN_ENABLE_FULL_BENCHMARKS
+  static void qs8_gemm_minmax_ukernel_1x4c8__wasmsimd_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_1x4c8__wasmsimd_ld64,
+      xnn_qs8_igemm_minmax_ukernel_1x4c8__wasmsimd_ld64,
+      xnn_qs8_gemm_minmax_ukernel_1x4c8__wasmsimd_ld64,
+      xnn_qs8_igemm_minmax_ukernel_1x4c8__wasmsimd_ld64,
+      1 /* mr */, 4 /* nr */, 3 /* log2_kr */);
+  }
+
+  static void qs8_gemm_minmax_ukernel_1x4c8__wasmsimd_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_1x4c8__wasmsimd_ld128,
+      xnn_qs8_igemm_minmax_ukernel_1x4c8__wasmsimd_ld128,
+      xnn_qs8_gemm_minmax_ukernel_1x4c8__wasmsimd_ld128,
+      xnn_qs8_igemm_minmax_ukernel_1x4c8__wasmsimd_ld128,
+      1 /* mr */, 4 /* nr */, 3 /* log2_kr */);
+  }
+#endif  // XNN_ENABLE_FULL_BENCHMARKS
+
+  static void qs8_gemm_minmax_ukernel_2x4c8__wasmsimd_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_2x4c8__wasmsimd_ld64,
+      xnn_qs8_igemm_minmax_ukernel_2x4c8__wasmsimd_ld64,
+      xnn_qs8_gemm_minmax_ukernel_1x4c8__wasmsimd_ld64,
+      xnn_qs8_igemm_minmax_ukernel_1x4c8__wasmsimd_ld64,
+      2 /* mr */, 4 /* nr */, 3 /* log2_kr */);
+  }
+
+  static void qs8_gemm_minmax_ukernel_2x4c8__wasmsimd_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_2x4c8__wasmsimd_ld128,
+      xnn_qs8_igemm_minmax_ukernel_2x4c8__wasmsimd_ld128,
+      xnn_qs8_gemm_minmax_ukernel_1x4c8__wasmsimd_ld128,
+      xnn_qs8_igemm_minmax_ukernel_1x4c8__wasmsimd_ld128,
+      2 /* mr */, 4 /* nr */, 3 /* log2_kr */);
+  }
+
+  static void qs8_gemm_minmax_ukernel_3x4c8__wasmsimd_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_3x4c8__wasmsimd_ld64,
+      xnn_qs8_igemm_minmax_ukernel_3x4c8__wasmsimd_ld64,
+      xnn_qs8_gemm_minmax_ukernel_1x4c8__wasmsimd_ld64,
+      xnn_qs8_igemm_minmax_ukernel_1x4c8__wasmsimd_ld64,
+      3 /* mr */, 4 /* nr */, 3 /* log2_kr */);
+  }
+
+  static void qs8_gemm_minmax_ukernel_3x4c8__wasmsimd_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_ukernel_3x4c8__wasmsimd_ld128,
+      xnn_qs8_igemm_minmax_ukernel_3x4c8__wasmsimd_ld128,
+      xnn_qs8_gemm_minmax_ukernel_1x4c8__wasmsimd_ld128,
+      xnn_qs8_igemm_minmax_ukernel_1x4c8__wasmsimd_ld128,
+      3 /* mr */, 4 /* nr */, 3 /* log2_kr */);
+  }
+
+#if XNN_ENABLE_FULL_BENCHMARKS
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_1x4c8__wasmsimd_ld64);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_1x4c8__wasmsimd_ld128);
+#endif  // XNN_ENABLE_FULL_BENCHMARKS
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_2x4c8__wasmsimd_ld64);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_2x4c8__wasmsimd_ld128);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_3x4c8__wasmsimd_ld64);
+  BENCHMARK_QS8_END2END(qs8_gemm_minmax_ukernel_3x4c8__wasmsimd_ld128);
+#endif  // XNN_ARCH_WASMSIMD
+
+#ifndef XNNPACK_BENCHMARK_NO_MAIN
+BENCHMARK_MAIN();
+#endif
diff --git a/bench/qs8-gemm.cc b/bench/qs8-gemm.cc
index f50452c..8a0d8f1 100644
--- a/bench/qs8-gemm.cc
+++ b/bench/qs8-gemm.cc
@@ -284,12 +284,174 @@
 #endif  // BENCHMARK_RUY
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  static void qs8_gemm_1x8__neon_mlal_lane(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_1x8__neon_mlal_lane, 1, 8, 1, 1, benchmark::utils::CheckNEON);
+  }
   static void qs8_gemm_2x8__neon_mlal_lane(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_2x8__neon_mlal_lane, 2, 8, 1, 1, benchmark::utils::CheckNEON);
   }
+  static void qs8_gemm_3x8__neon_mlal_lane(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane, 3, 8, 1, 1, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_4x8__neon_mlal_lane(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane, 4, 8, 1, 1, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_1x16__neon_mlal_lane(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane, 1, 16, 1, 1, benchmark::utils::CheckNEON);
+  }
   static void qs8_gemm_2x16__neon_mlal_lane(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane, 2, 16, 1, 1, benchmark::utils::CheckNEON);
   }
+  static void qs8_gemm_3x16__neon_mlal_lane(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane, 3, 16, 1, 1, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_4x16__neon_mlal_lane(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane, 4, 16, 1, 1, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_1x8__neon_mull_addw_dup(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_1x8__neon_mull_addw_dup, 1, 8, 1, 1, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_2x8__neon_mull_addw_dup(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_2x8__neon_mull_addw_dup, 2, 8, 1, 1, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_3x8__neon_mull_addw_dup(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_3x8__neon_mull_addw_dup, 3, 8, 1, 1, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_4x8__neon_mull_addw_dup(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_4x8__neon_mull_addw_dup, 4, 8, 1, 1, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_1x16__neon_mull_addw_dup(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup, 1, 16, 1, 1, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_2x16__neon_mull_addw_dup(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup, 2, 16, 1, 1, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_3x16__neon_mull_addw_dup(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup, 3, 16, 1, 1, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_4x16__neon_mull_addw_dup(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup, 4, 16, 1, 1, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_1x8c2__neon_mull_padal_dup(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mull_padal_dup, 1, 8, 2, 1, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_2x8c2__neon_mull_padal_dup(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mull_padal_dup, 2, 8, 2, 1, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_3x8c2__neon_mull_padal_dup(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mull_padal_dup, 3, 8, 2, 1, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_4x8c2__neon_mull_padal_dup(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mull_padal_dup, 4, 8, 2, 1, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_1x16c2__neon_mull_padal_dup(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup, 1, 16, 2, 1, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_2x16c2__neon_mull_padal_dup(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup, 2, 16, 2, 1, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_3x16c2__neon_mull_padal_dup(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup, 3, 16, 2, 1, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_4x16c2__neon_mull_padal_dup(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup, 4, 16, 2, 1, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_1x8c2__neon_mlal_padal_dup(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup, 1, 8, 2, 1, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_2x8c2__neon_mlal_padal_dup(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup, 2, 8, 2, 1, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_3x8c2__neon_mlal_padal_dup(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup, 3, 8, 2, 1, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_4x8c2__neon_mlal_padal_dup(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup, 4, 8, 2, 1, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_1x16c2__neon_mlal_padal_dup(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup, 1, 16, 2, 1, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_2x16c2__neon_mlal_padal_dup(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup, 2, 16, 2, 1, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_3x16c2__neon_mlal_padal_dup(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup, 3, 16, 2, 1, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_4x16c2__neon_mlal_padal_dup(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup, 4, 16, 2, 1, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_1x8c8__neon_mull_padal(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mull_padal, 1, 8, 8, 1, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_2x8c8__neon_mull_padal(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mull_padal, 2, 8, 8, 1, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_3x8c8__neon_mull_padal(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal, 3, 8, 8, 1, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_4x8c8__neon_mull_padal(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal, 4, 8, 8, 1, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_1x16c8__neon_mull_padal(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal, 1, 16, 8, 1, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_2x16c8__neon_mull_padal(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal, 2, 16, 8, 1, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_3x16c8__neon_mull_padal(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal, 3, 16, 8, 1, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_4x16c8__neon_mull_padal(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal, 4, 16, 8, 1, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_1x8c8__neon_mlal_padal(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal, 1, 8, 8, 1, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_2x8c8__neon_mlal_padal(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal, 2, 8, 8, 1, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_3x8c8__neon_mlal_padal(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal, 3, 8, 8, 1, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_4x8c8__neon_mlal_padal(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal, 4, 8, 8, 1, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_1x16c8__neon_mlal_padal(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal, 1, 16, 8, 1, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_2x16c8__neon_mlal_padal(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal, 2, 16, 8, 1, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_3x16c8__neon_mlal_padal(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal, 3, 16, 8, 1, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_4x16c8__neon_mlal_padal(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal, 4, 16, 8, 1, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_1x8c16__neon_mlal_padal(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_1x8c16__neon_mlal_padal, 1, 8, 16, 1, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_2x8c16__neon_mlal_padal(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_2x8c16__neon_mlal_padal, 2, 8, 16, 1, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_3x8c16__neon_mlal_padal(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal, 3, 8, 16, 1, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_4x8c16__neon_mlal_padal(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal, 4, 8, 16, 1, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_1x16c16__neon_mlal_padal(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal, 1, 16, 16, 1, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_2x16c16__neon_mlal_padal(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal, 2, 16, 16, 1, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_3x16c16__neon_mlal_padal(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal, 3, 16, 16, 1, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_4x16c16__neon_mlal_padal(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal, 4, 16, 16, 1, benchmark::utils::CheckNEON);
+  }
   static void qs8_gemm_1x8c4__neondot(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_1x8c4__neondot, 1, 8, 4, 1, benchmark::utils::CheckNEONDOT);
   }
@@ -302,9 +464,6 @@
   static void qs8_gemm_8x8c4__neondot(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_8x8c4__neondot, 8, 8, 4, 1, benchmark::utils::CheckNEONDOT);
   }
-  static void qs8_gemm_12x8c4__neondot(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_12x8c4__neondot, 12, 8, 4, 1, benchmark::utils::CheckNEONDOT);
-  }
   static void qs8_gemm_1x16c4__neondot(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot, 1, 16, 4, 1, benchmark::utils::CheckNEONDOT);
   }
@@ -318,13 +477,66 @@
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot, 8, 16, 4, 1, benchmark::utils::CheckNEONDOT);
   }
 
+  BENCHMARK_GEMM(qs8_gemm_1x8__neon_mlal_lane)
   BENCHMARK_GEMM(qs8_gemm_2x8__neon_mlal_lane)
+  BENCHMARK_GEMM(qs8_gemm_3x8__neon_mlal_lane)
+  BENCHMARK_GEMM(qs8_gemm_4x8__neon_mlal_lane)
+  BENCHMARK_GEMM(qs8_gemm_1x16__neon_mlal_lane)
   BENCHMARK_GEMM(qs8_gemm_2x16__neon_mlal_lane)
+  BENCHMARK_GEMM(qs8_gemm_3x16__neon_mlal_lane)
+  BENCHMARK_GEMM(qs8_gemm_4x16__neon_mlal_lane)
+  BENCHMARK_GEMM(qs8_gemm_1x8__neon_mull_addw_dup)
+  BENCHMARK_GEMM(qs8_gemm_2x8__neon_mull_addw_dup)
+  BENCHMARK_GEMM(qs8_gemm_3x8__neon_mull_addw_dup)
+  BENCHMARK_GEMM(qs8_gemm_4x8__neon_mull_addw_dup)
+  BENCHMARK_GEMM(qs8_gemm_1x16__neon_mull_addw_dup)
+  BENCHMARK_GEMM(qs8_gemm_2x16__neon_mull_addw_dup)
+  BENCHMARK_GEMM(qs8_gemm_3x16__neon_mull_addw_dup)
+  BENCHMARK_GEMM(qs8_gemm_4x16__neon_mull_addw_dup)
+  BENCHMARK_GEMM(qs8_gemm_1x8c2__neon_mull_padal_dup)
+  BENCHMARK_GEMM(qs8_gemm_2x8c2__neon_mull_padal_dup)
+  BENCHMARK_GEMM(qs8_gemm_3x8c2__neon_mull_padal_dup)
+  BENCHMARK_GEMM(qs8_gemm_4x8c2__neon_mull_padal_dup)
+  BENCHMARK_GEMM(qs8_gemm_1x16c2__neon_mull_padal_dup)
+  BENCHMARK_GEMM(qs8_gemm_2x16c2__neon_mull_padal_dup)
+  BENCHMARK_GEMM(qs8_gemm_3x16c2__neon_mull_padal_dup)
+  BENCHMARK_GEMM(qs8_gemm_4x16c2__neon_mull_padal_dup)
+  BENCHMARK_GEMM(qs8_gemm_1x8c2__neon_mlal_padal_dup)
+  BENCHMARK_GEMM(qs8_gemm_2x8c2__neon_mlal_padal_dup)
+  BENCHMARK_GEMM(qs8_gemm_3x8c2__neon_mlal_padal_dup)
+  BENCHMARK_GEMM(qs8_gemm_4x8c2__neon_mlal_padal_dup)
+  BENCHMARK_GEMM(qs8_gemm_1x16c2__neon_mlal_padal_dup)
+  BENCHMARK_GEMM(qs8_gemm_2x16c2__neon_mlal_padal_dup)
+  BENCHMARK_GEMM(qs8_gemm_3x16c2__neon_mlal_padal_dup)
+  BENCHMARK_GEMM(qs8_gemm_4x16c2__neon_mlal_padal_dup)
+  BENCHMARK_GEMM(qs8_gemm_1x8c8__neon_mull_padal)
+  BENCHMARK_GEMM(qs8_gemm_2x8c8__neon_mull_padal)
+  BENCHMARK_GEMM(qs8_gemm_3x8c8__neon_mull_padal)
+  BENCHMARK_GEMM(qs8_gemm_4x8c8__neon_mull_padal)
+  BENCHMARK_GEMM(qs8_gemm_1x16c8__neon_mull_padal)
+  BENCHMARK_GEMM(qs8_gemm_2x16c8__neon_mull_padal)
+  BENCHMARK_GEMM(qs8_gemm_3x16c8__neon_mull_padal)
+  BENCHMARK_GEMM(qs8_gemm_4x16c8__neon_mull_padal)
+  BENCHMARK_GEMM(qs8_gemm_1x8c8__neon_mlal_padal)
+  BENCHMARK_GEMM(qs8_gemm_2x8c8__neon_mlal_padal)
+  BENCHMARK_GEMM(qs8_gemm_3x8c8__neon_mlal_padal)
+  BENCHMARK_GEMM(qs8_gemm_4x8c8__neon_mlal_padal)
+  BENCHMARK_GEMM(qs8_gemm_1x16c8__neon_mlal_padal)
+  BENCHMARK_GEMM(qs8_gemm_2x16c8__neon_mlal_padal)
+  BENCHMARK_GEMM(qs8_gemm_3x16c8__neon_mlal_padal)
+  BENCHMARK_GEMM(qs8_gemm_4x16c8__neon_mlal_padal)
+  BENCHMARK_GEMM(qs8_gemm_1x8c16__neon_mlal_padal)
+  BENCHMARK_GEMM(qs8_gemm_2x8c16__neon_mlal_padal)
+  BENCHMARK_GEMM(qs8_gemm_3x8c16__neon_mlal_padal)
+  BENCHMARK_GEMM(qs8_gemm_4x8c16__neon_mlal_padal)
+  BENCHMARK_GEMM(qs8_gemm_1x16c16__neon_mlal_padal)
+  BENCHMARK_GEMM(qs8_gemm_2x16c16__neon_mlal_padal)
+  BENCHMARK_GEMM(qs8_gemm_3x16c16__neon_mlal_padal)
+  BENCHMARK_GEMM(qs8_gemm_4x16c16__neon_mlal_padal)
   BENCHMARK_GEMM(qs8_gemm_1x8c4__neondot)
   BENCHMARK_GEMM(qs8_gemm_4x8c4__neondot)
   BENCHMARK_GEMM(qs8_gemm_6x8c4__neondot)
   BENCHMARK_GEMM(qs8_gemm_8x8c4__neondot)
-  BENCHMARK_GEMM(qs8_gemm_12x8c4__neondot)
   BENCHMARK_GEMM(qs8_gemm_1x16c4__neondot)
   BENCHMARK_GEMM(qs8_gemm_4x16c4__neondot)
   BENCHMARK_GEMM(qs8_gemm_6x16c4__neondot)
@@ -332,22 +544,41 @@
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 #if XNN_ARCH_ARM64
+  static void qs8_gemm_4x16c4__aarch64_neondot_cortex_a55(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55, 4, 16, 4, 1, benchmark::utils::CheckNEONDOT);
+  }
+  static void qs8_gemm_1x16c4__aarch64_neondot_ld32(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld32, 1, 16, 4, 1, benchmark::utils::CheckNEONDOT);
+  }
   static void qs8_gemm_1x16c4__aarch64_neondot_ld64(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld64, 1, 16, 4, 1, benchmark::utils::CheckNEONDOT);
   }
+  static void qs8_gemm_4x16c4__aarch64_neondot_ld32(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld32, 4, 16, 4, 1, benchmark::utils::CheckNEONDOT);
+  }
   static void qs8_gemm_4x16c4__aarch64_neondot_ld64(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64, 4, 16, 4, 1, benchmark::utils::CheckNEONDOT);
   }
-  static void qs8_gemm_4x16c4__aarch64_neondot_cortex_a55(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55, 4, 16, 4, 1, benchmark::utils::CheckNEONDOT);
+  static void qs8_gemm_2x8c8__aarch64_neon_mull_padal(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mull_padal, 2, 8, 8, 1, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_2x8c8__aarch64_neon_mlal_padal(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal, 2, 8, 8, 1, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_2x8c16__aarch64_neon_mlal_padal(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal, 2, 8, 16, 1, benchmark::utils::CheckNEON);
   }
 
+  BENCHMARK_GEMM(qs8_gemm_1x16c4__aarch64_neondot_ld32)
   BENCHMARK_GEMM(qs8_gemm_1x16c4__aarch64_neondot_ld64)
+  BENCHMARK_GEMM(qs8_gemm_4x16c4__aarch64_neondot_ld32)
   BENCHMARK_GEMM(qs8_gemm_4x16c4__aarch64_neondot_ld64)
   BENCHMARK_GEMM(qs8_gemm_4x16c4__aarch64_neondot_cortex_a55)
+  BENCHMARK_GEMM(qs8_gemm_2x8c8__aarch64_neon_mull_padal)
+  BENCHMARK_GEMM(qs8_gemm_2x8c8__aarch64_neon_mlal_padal)
+  BENCHMARK_GEMM(qs8_gemm_2x8c16__aarch64_neon_mlal_padal)
 #endif  // XNN_ARCH_ARM64
 
-
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   static void qs8_gemm_4x4c2__sse2_ld64(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_4x4c2__sse2_ld64, 4, 4, 2, 1);
diff --git a/build_defs.bzl b/build_defs.bzl
index 1ce9482..af543b9 100644
--- a/build_defs.bzl
+++ b/build_defs.bzl
@@ -146,6 +146,7 @@
             ":linux_armv7a": aarch32_srcs,
             ":linux_aarch64": aarch64_srcs,
             ":macos_x86_64": x86_srcs,
+            ":macos_arm64": aarch64_srcs,
             ":windows_x86_64_clang": x86_srcs,
             ":windows_x86_64_mingw": x86_srcs,
             ":windows_x86_64_msys": x86_srcs,
@@ -180,6 +181,7 @@
             ":linux_armv7a": aarch32_copts,
             ":linux_aarch64": aarch64_copts,
             ":macos_x86_64": gcc_x86_copts,
+            ":macos_arm64": aarch64_copts,
             ":windows_x86_64_clang": ["/clang:" + opt for opt in gcc_x86_copts],
             ":windows_x86_64_mingw": mingw_copts + gcc_x86_copts,
             ":windows_x86_64_msys": msys_copts + gcc_x86_copts,
@@ -261,6 +263,7 @@
             ":linux_armv7a": aarch32_deps,
             ":linux_aarch64": aarch64_deps,
             ":macos_x86_64": x86_deps,
+            ":macos_arm64": aarch64_deps,
             ":windows_x86_64_clang": x86_deps,
             ":windows_x86_64_mingw": x86_deps,
             ":windows_x86_64_msys": x86_deps,
diff --git a/emscripten.bzl b/emscripten.bzl
index faad087..0a0caed 100644
--- a/emscripten.bzl
+++ b/emscripten.bzl
@@ -26,7 +26,7 @@
         "-s ERROR_ON_UNDEFINED_SYMBOLS=1",
         "-s EXIT_RUNTIME=1",
         "-s ALLOW_MEMORY_GROWTH=1",
-        "-s TOTAL_MEMORY=268435456",  # 256M
+        "-s TOTAL_MEMORY=436207616",  # 416M
         "--pre-js $(location :preamble.js.lds)",
     ]
 
diff --git a/models/fp16-mobilenet-v1.cc b/models/fp16-mobilenet-v1.cc
index 612cfe6..d1c902d 100644
--- a/models/fp16-mobilenet-v1.cc
+++ b/models/fp16-mobilenet-v1.cc
@@ -5,6 +5,7 @@
 
 #include <xnnpack.h>
 
+#include <array>
 #include <algorithm>
 #include <functional>
 #include <iostream>
@@ -18,183 +19,183 @@
 namespace models {
 
 ExecutionPlan FP16MobileNetV1(pthreadpool_t threadpool) {
-  alignas(16) static uint16_t v0[150528];
-  alignas(16) static uint16_t v1[401408];
-  alignas(16) static uint16_t v2[401408];
-  alignas(16) static uint16_t v3[802816];
-  alignas(16) static uint16_t v4[200704];
-  alignas(16) static uint16_t v5[401408];
-  alignas(16) static uint16_t v6[401408];
-  alignas(16) static uint16_t v7[401408];
-  alignas(16) static uint16_t v8[100352];
-  alignas(16) static uint16_t v9[200704];
-  alignas(16) static uint16_t v10[200704];
-  alignas(16) static uint16_t v11[200704];
-  alignas(16) static uint16_t v12[50176];
-  alignas(16) static uint16_t v13[100352];
-  alignas(16) static uint16_t v14[100352];
-  alignas(16) static uint16_t v15[100352];
-  alignas(16) static uint16_t v16[100352];
-  alignas(16) static uint16_t v17[100352];
-  alignas(16) static uint16_t v18[100352];
-  alignas(16) static uint16_t v19[100352];
-  alignas(16) static uint16_t v20[100352];
-  alignas(16) static uint16_t v21[100352];
-  alignas(16) static uint16_t v22[100352];
-  alignas(16) static uint16_t v23[100352];
-  alignas(16) static uint16_t v24[25088];
-  alignas(16) static uint16_t v25[50176];
-  alignas(16) static uint16_t v26[50176];
-  alignas(16) static uint16_t v27[50176];
-  alignas(16) static uint16_t v28[1024];
-  alignas(16) static uint16_t v29[1001];
-  alignas(16) static uint16_t w30[864];
-  alignas(16) static uint16_t w31[32];
-  alignas(16) static uint16_t w32[288];
-  alignas(16) static uint16_t w33[32];
-  alignas(16) static uint16_t w34[2048];
-  alignas(16) static uint16_t w35[64];
-  alignas(16) static uint16_t w36[576];
-  alignas(16) static uint16_t w37[64];
-  alignas(16) static uint16_t w38[8192];
-  alignas(16) static uint16_t w39[128];
-  alignas(16) static uint16_t w40[1152];
-  alignas(16) static uint16_t w41[128];
-  alignas(16) static uint16_t w42[16384];
-  alignas(16) static uint16_t w43[128];
-  alignas(16) static uint16_t w44[1152];
-  alignas(16) static uint16_t w45[128];
-  alignas(16) static uint16_t w46[32768];
-  alignas(16) static uint16_t w47[256];
-  alignas(16) static uint16_t w48[2304];
-  alignas(16) static uint16_t w49[256];
-  alignas(16) static uint16_t w50[65536];
-  alignas(16) static uint16_t w51[256];
-  alignas(16) static uint16_t w52[2304];
-  alignas(16) static uint16_t w53[256];
-  alignas(16) static uint16_t w54[131072];
-  alignas(16) static uint16_t w55[512];
-  alignas(16) static uint16_t w56[4608];
-  alignas(16) static uint16_t w57[512];
-  alignas(16) static uint16_t w58[262144];
-  alignas(16) static uint16_t w59[512];
-  alignas(16) static uint16_t w60[4608];
-  alignas(16) static uint16_t w61[512];
-  alignas(16) static uint16_t w62[262144];
-  alignas(16) static uint16_t w63[512];
-  alignas(16) static uint16_t w64[4608];
-  alignas(16) static uint16_t w65[512];
-  alignas(16) static uint16_t w66[262144];
-  alignas(16) static uint16_t w67[512];
-  alignas(16) static uint16_t w68[4608];
-  alignas(16) static uint16_t w69[512];
-  alignas(16) static uint16_t w70[262144];
-  alignas(16) static uint16_t w71[512];
-  alignas(16) static uint16_t w72[4608];
-  alignas(16) static uint16_t w73[512];
-  alignas(16) static uint16_t w74[262144];
-  alignas(16) static uint16_t w75[512];
-  alignas(16) static uint16_t w76[4608];
-  alignas(16) static uint16_t w77[512];
-  alignas(16) static uint16_t w78[524288];
-  alignas(16) static uint16_t w79[1024];
-  alignas(16) static uint16_t w80[9216];
-  alignas(16) static uint16_t w81[1024];
-  alignas(16) static uint16_t w82[1048576];
-  alignas(16) static uint16_t w83[1024];
-  alignas(16) static uint16_t w84[1025024];
-  alignas(16) static uint16_t w85[1001];
+  alignas(16) static std::array<uint16_t, 150528> v0;
+  alignas(16) static std::array<uint16_t, 401408> v1;
+  alignas(16) static std::array<uint16_t, 401408> v2;
+  alignas(16) static std::array<uint16_t, 802816> v3;
+  alignas(16) static std::array<uint16_t, 200704> v4;
+  alignas(16) static std::array<uint16_t, 401408> v5;
+  alignas(16) static std::array<uint16_t, 401408> v6;
+  alignas(16) static std::array<uint16_t, 401408> v7;
+  alignas(16) static std::array<uint16_t, 100352> v8;
+  alignas(16) static std::array<uint16_t, 200704> v9;
+  alignas(16) static std::array<uint16_t, 200704> v10;
+  alignas(16) static std::array<uint16_t, 200704> v11;
+  alignas(16) static std::array<uint16_t, 50176> v12;
+  alignas(16) static std::array<uint16_t, 100352> v13;
+  alignas(16) static std::array<uint16_t, 100352> v14;
+  alignas(16) static std::array<uint16_t, 100352> v15;
+  alignas(16) static std::array<uint16_t, 100352> v16;
+  alignas(16) static std::array<uint16_t, 100352> v17;
+  alignas(16) static std::array<uint16_t, 100352> v18;
+  alignas(16) static std::array<uint16_t, 100352> v19;
+  alignas(16) static std::array<uint16_t, 100352> v20;
+  alignas(16) static std::array<uint16_t, 100352> v21;
+  alignas(16) static std::array<uint16_t, 100352> v22;
+  alignas(16) static std::array<uint16_t, 100352> v23;
+  alignas(16) static std::array<uint16_t, 25088> v24;
+  alignas(16) static std::array<uint16_t, 50176> v25;
+  alignas(16) static std::array<uint16_t, 50176> v26;
+  alignas(16) static std::array<uint16_t, 50176> v27;
+  alignas(16) static std::array<uint16_t, 1024> v28;
+  alignas(16) static std::array<uint16_t, 1001> v29;
+  alignas(16) static std::array<uint16_t, 864> w30;
+  alignas(16) static std::array<uint16_t, 32> w31;
+  alignas(16) static std::array<uint16_t, 288> w32;
+  alignas(16) static std::array<uint16_t, 32> w33;
+  alignas(16) static std::array<uint16_t, 2048> w34;
+  alignas(16) static std::array<uint16_t, 64> w35;
+  alignas(16) static std::array<uint16_t, 576> w36;
+  alignas(16) static std::array<uint16_t, 64> w37;
+  alignas(16) static std::array<uint16_t, 8192> w38;
+  alignas(16) static std::array<uint16_t, 128> w39;
+  alignas(16) static std::array<uint16_t, 1152> w40;
+  alignas(16) static std::array<uint16_t, 128> w41;
+  alignas(16) static std::array<uint16_t, 16384> w42;
+  alignas(16) static std::array<uint16_t, 128> w43;
+  alignas(16) static std::array<uint16_t, 1152> w44;
+  alignas(16) static std::array<uint16_t, 128> w45;
+  alignas(16) static std::array<uint16_t, 32768> w46;
+  alignas(16) static std::array<uint16_t, 256> w47;
+  alignas(16) static std::array<uint16_t, 2304> w48;
+  alignas(16) static std::array<uint16_t, 256> w49;
+  alignas(16) static std::array<uint16_t, 65536> w50;
+  alignas(16) static std::array<uint16_t, 256> w51;
+  alignas(16) static std::array<uint16_t, 2304> w52;
+  alignas(16) static std::array<uint16_t, 256> w53;
+  alignas(16) static std::array<uint16_t, 131072> w54;
+  alignas(16) static std::array<uint16_t, 512> w55;
+  alignas(16) static std::array<uint16_t, 4608> w56;
+  alignas(16) static std::array<uint16_t, 512> w57;
+  alignas(16) static std::array<uint16_t, 262144> w58;
+  alignas(16) static std::array<uint16_t, 512> w59;
+  alignas(16) static std::array<uint16_t, 4608> w60;
+  alignas(16) static std::array<uint16_t, 512> w61;
+  alignas(16) static std::array<uint16_t, 262144> w62;
+  alignas(16) static std::array<uint16_t, 512> w63;
+  alignas(16) static std::array<uint16_t, 4608> w64;
+  alignas(16) static std::array<uint16_t, 512> w65;
+  alignas(16) static std::array<uint16_t, 262144> w66;
+  alignas(16) static std::array<uint16_t, 512> w67;
+  alignas(16) static std::array<uint16_t, 4608> w68;
+  alignas(16) static std::array<uint16_t, 512> w69;
+  alignas(16) static std::array<uint16_t, 262144> w70;
+  alignas(16) static std::array<uint16_t, 512> w71;
+  alignas(16) static std::array<uint16_t, 4608> w72;
+  alignas(16) static std::array<uint16_t, 512> w73;
+  alignas(16) static std::array<uint16_t, 262144> w74;
+  alignas(16) static std::array<uint16_t, 512> w75;
+  alignas(16) static std::array<uint16_t, 4608> w76;
+  alignas(16) static std::array<uint16_t, 512> w77;
+  alignas(16) static std::array<uint16_t, 524288> w78;
+  alignas(16) static std::array<uint16_t, 1024> w79;
+  alignas(16) static std::array<uint16_t, 9216> w80;
+  alignas(16) static std::array<uint16_t, 1024> w81;
+  alignas(16) static std::array<uint16_t, 1048576> w82;
+  alignas(16) static std::array<uint16_t, 1024> w83;
+  alignas(16) static std::array<uint16_t, 1025024> w84;
+  alignas(16) static std::array<uint16_t, 1001> w85;
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.0f, +1.0f), std::ref(rng));
   auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
-  std::generate(v0, v0 + 150528, std::ref(f16rng));
-  std::generate(v1, v1 + 401408, std::ref(f16rng));
-  std::generate(v2, v2 + 401408, std::ref(f16rng));
-  std::generate(v3, v3 + 802816, std::ref(f16rng));
-  std::generate(v4, v4 + 200704, std::ref(f16rng));
-  std::generate(v5, v5 + 401408, std::ref(f16rng));
-  std::generate(v6, v6 + 401408, std::ref(f16rng));
-  std::generate(v7, v7 + 401408, std::ref(f16rng));
-  std::generate(v8, v8 + 100352, std::ref(f16rng));
-  std::generate(v9, v9 + 200704, std::ref(f16rng));
-  std::generate(v10, v10 + 200704, std::ref(f16rng));
-  std::generate(v11, v11 + 200704, std::ref(f16rng));
-  std::generate(v12, v12 + 50176, std::ref(f16rng));
-  std::generate(v13, v13 + 100352, std::ref(f16rng));
-  std::generate(v14, v14 + 100352, std::ref(f16rng));
-  std::generate(v15, v15 + 100352, std::ref(f16rng));
-  std::generate(v16, v16 + 100352, std::ref(f16rng));
-  std::generate(v17, v17 + 100352, std::ref(f16rng));
-  std::generate(v18, v18 + 100352, std::ref(f16rng));
-  std::generate(v19, v19 + 100352, std::ref(f16rng));
-  std::generate(v20, v20 + 100352, std::ref(f16rng));
-  std::generate(v21, v21 + 100352, std::ref(f16rng));
-  std::generate(v22, v22 + 100352, std::ref(f16rng));
-  std::generate(v23, v23 + 100352, std::ref(f16rng));
-  std::generate(v24, v24 + 25088, std::ref(f16rng));
-  std::generate(v25, v25 + 50176, std::ref(f16rng));
-  std::generate(v26, v26 + 50176, std::ref(f16rng));
-  std::generate(v27, v27 + 50176, std::ref(f16rng));
-  std::generate(v28, v28 + 1024, std::ref(f16rng));
-  std::generate(v29, v29 + 1001, std::ref(f16rng));
-  std::generate(w30, w30 + 864, std::ref(f16rng));
-  std::generate(w31, w31 + 32, std::ref(f16rng));
-  std::generate(w32, w32 + 288, std::ref(f16rng));
-  std::generate(w33, w33 + 32, std::ref(f16rng));
-  std::generate(w34, w34 + 2048, std::ref(f16rng));
-  std::generate(w35, w35 + 64, std::ref(f16rng));
-  std::generate(w36, w36 + 576, std::ref(f16rng));
-  std::generate(w37, w37 + 64, std::ref(f16rng));
-  std::generate(w38, w38 + 8192, std::ref(f16rng));
-  std::generate(w39, w39 + 128, std::ref(f16rng));
-  std::generate(w40, w40 + 1152, std::ref(f16rng));
-  std::generate(w41, w41 + 128, std::ref(f16rng));
-  std::generate(w42, w42 + 16384, std::ref(f16rng));
-  std::generate(w43, w43 + 128, std::ref(f16rng));
-  std::generate(w44, w44 + 1152, std::ref(f16rng));
-  std::generate(w45, w45 + 128, std::ref(f16rng));
-  std::generate(w46, w46 + 32768, std::ref(f16rng));
-  std::generate(w47, w47 + 256, std::ref(f16rng));
-  std::generate(w48, w48 + 2304, std::ref(f16rng));
-  std::generate(w49, w49 + 256, std::ref(f16rng));
-  std::generate(w50, w50 + 65536, std::ref(f16rng));
-  std::generate(w51, w51 + 256, std::ref(f16rng));
-  std::generate(w52, w52 + 2304, std::ref(f16rng));
-  std::generate(w53, w53 + 256, std::ref(f16rng));
-  std::generate(w54, w54 + 131072, std::ref(f16rng));
-  std::generate(w55, w55 + 512, std::ref(f16rng));
-  std::generate(w56, w56 + 4608, std::ref(f16rng));
-  std::generate(w57, w57 + 512, std::ref(f16rng));
-  std::generate(w58, w58 + 262144, std::ref(f16rng));
-  std::generate(w59, w59 + 512, std::ref(f16rng));
-  std::generate(w60, w60 + 4608, std::ref(f16rng));
-  std::generate(w61, w61 + 512, std::ref(f16rng));
-  std::generate(w62, w62 + 262144, std::ref(f16rng));
-  std::generate(w63, w63 + 512, std::ref(f16rng));
-  std::generate(w64, w64 + 4608, std::ref(f16rng));
-  std::generate(w65, w65 + 512, std::ref(f16rng));
-  std::generate(w66, w66 + 262144, std::ref(f16rng));
-  std::generate(w67, w67 + 512, std::ref(f16rng));
-  std::generate(w68, w68 + 4608, std::ref(f16rng));
-  std::generate(w69, w69 + 512, std::ref(f16rng));
-  std::generate(w70, w70 + 262144, std::ref(f16rng));
-  std::generate(w71, w71 + 512, std::ref(f16rng));
-  std::generate(w72, w72 + 4608, std::ref(f16rng));
-  std::generate(w73, w73 + 512, std::ref(f16rng));
-  std::generate(w74, w74 + 262144, std::ref(f16rng));
-  std::generate(w75, w75 + 512, std::ref(f16rng));
-  std::generate(w76, w76 + 4608, std::ref(f16rng));
-  std::generate(w77, w77 + 512, std::ref(f16rng));
-  std::generate(w78, w78 + 524288, std::ref(f16rng));
-  std::generate(w79, w79 + 1024, std::ref(f16rng));
-  std::generate(w80, w80 + 9216, std::ref(f16rng));
-  std::generate(w81, w81 + 1024, std::ref(f16rng));
-  std::generate(w82, w82 + 1048576, std::ref(f16rng));
-  std::generate(w83, w83 + 1024, std::ref(f16rng));
-  std::generate(w84, w84 + 1025024, std::ref(f16rng));
-  std::generate(w85, w85 + 1001, std::ref(f16rng));
+  std::generate(v0.begin(), v0.end(), std::ref(f16rng));
+  std::generate(v1.begin(), v1.end(), std::ref(f16rng));
+  std::generate(v2.begin(), v2.end(), std::ref(f16rng));
+  std::generate(v3.begin(), v3.end(), std::ref(f16rng));
+  std::generate(v4.begin(), v4.end(), std::ref(f16rng));
+  std::generate(v5.begin(), v5.end(), std::ref(f16rng));
+  std::generate(v6.begin(), v6.end(), std::ref(f16rng));
+  std::generate(v7.begin(), v7.end(), std::ref(f16rng));
+  std::generate(v8.begin(), v8.end(), std::ref(f16rng));
+  std::generate(v9.begin(), v9.end(), std::ref(f16rng));
+  std::generate(v10.begin(), v10.end(), std::ref(f16rng));
+  std::generate(v11.begin(), v11.end(), std::ref(f16rng));
+  std::generate(v12.begin(), v12.end(), std::ref(f16rng));
+  std::generate(v13.begin(), v13.end(), std::ref(f16rng));
+  std::generate(v14.begin(), v14.end(), std::ref(f16rng));
+  std::generate(v15.begin(), v15.end(), std::ref(f16rng));
+  std::generate(v16.begin(), v16.end(), std::ref(f16rng));
+  std::generate(v17.begin(), v17.end(), std::ref(f16rng));
+  std::generate(v18.begin(), v18.end(), std::ref(f16rng));
+  std::generate(v19.begin(), v19.end(), std::ref(f16rng));
+  std::generate(v20.begin(), v20.end(), std::ref(f16rng));
+  std::generate(v21.begin(), v21.end(), std::ref(f16rng));
+  std::generate(v22.begin(), v22.end(), std::ref(f16rng));
+  std::generate(v23.begin(), v23.end(), std::ref(f16rng));
+  std::generate(v24.begin(), v24.end(), std::ref(f16rng));
+  std::generate(v25.begin(), v25.end(), std::ref(f16rng));
+  std::generate(v26.begin(), v26.end(), std::ref(f16rng));
+  std::generate(v27.begin(), v27.end(), std::ref(f16rng));
+  std::generate(v28.begin(), v28.end(), std::ref(f16rng));
+  std::generate(v29.begin(), v29.end(), std::ref(f16rng));
+  std::generate(w30.begin(), w30.end(), std::ref(f16rng));
+  std::generate(w31.begin(), w31.end(), std::ref(f16rng));
+  std::generate(w32.begin(), w32.end(), std::ref(f16rng));
+  std::generate(w33.begin(), w33.end(), std::ref(f16rng));
+  std::generate(w34.begin(), w34.end(), std::ref(f16rng));
+  std::generate(w35.begin(), w35.end(), std::ref(f16rng));
+  std::generate(w36.begin(), w36.end(), std::ref(f16rng));
+  std::generate(w37.begin(), w37.end(), std::ref(f16rng));
+  std::generate(w38.begin(), w38.end(), std::ref(f16rng));
+  std::generate(w39.begin(), w39.end(), std::ref(f16rng));
+  std::generate(w40.begin(), w40.end(), std::ref(f16rng));
+  std::generate(w41.begin(), w41.end(), std::ref(f16rng));
+  std::generate(w42.begin(), w42.end(), std::ref(f16rng));
+  std::generate(w43.begin(), w43.end(), std::ref(f16rng));
+  std::generate(w44.begin(), w44.end(), std::ref(f16rng));
+  std::generate(w45.begin(), w45.end(), std::ref(f16rng));
+  std::generate(w46.begin(), w46.end(), std::ref(f16rng));
+  std::generate(w47.begin(), w47.end(), std::ref(f16rng));
+  std::generate(w48.begin(), w48.end(), std::ref(f16rng));
+  std::generate(w49.begin(), w49.end(), std::ref(f16rng));
+  std::generate(w50.begin(), w50.end(), std::ref(f16rng));
+  std::generate(w51.begin(), w51.end(), std::ref(f16rng));
+  std::generate(w52.begin(), w52.end(), std::ref(f16rng));
+  std::generate(w53.begin(), w53.end(), std::ref(f16rng));
+  std::generate(w54.begin(), w54.end(), std::ref(f16rng));
+  std::generate(w55.begin(), w55.end(), std::ref(f16rng));
+  std::generate(w56.begin(), w56.end(), std::ref(f16rng));
+  std::generate(w57.begin(), w57.end(), std::ref(f16rng));
+  std::generate(w58.begin(), w58.end(), std::ref(f16rng));
+  std::generate(w59.begin(), w59.end(), std::ref(f16rng));
+  std::generate(w60.begin(), w60.end(), std::ref(f16rng));
+  std::generate(w61.begin(), w61.end(), std::ref(f16rng));
+  std::generate(w62.begin(), w62.end(), std::ref(f16rng));
+  std::generate(w63.begin(), w63.end(), std::ref(f16rng));
+  std::generate(w64.begin(), w64.end(), std::ref(f16rng));
+  std::generate(w65.begin(), w65.end(), std::ref(f16rng));
+  std::generate(w66.begin(), w66.end(), std::ref(f16rng));
+  std::generate(w67.begin(), w67.end(), std::ref(f16rng));
+  std::generate(w68.begin(), w68.end(), std::ref(f16rng));
+  std::generate(w69.begin(), w69.end(), std::ref(f16rng));
+  std::generate(w70.begin(), w70.end(), std::ref(f16rng));
+  std::generate(w71.begin(), w71.end(), std::ref(f16rng));
+  std::generate(w72.begin(), w72.end(), std::ref(f16rng));
+  std::generate(w73.begin(), w73.end(), std::ref(f16rng));
+  std::generate(w74.begin(), w74.end(), std::ref(f16rng));
+  std::generate(w75.begin(), w75.end(), std::ref(f16rng));
+  std::generate(w76.begin(), w76.end(), std::ref(f16rng));
+  std::generate(w77.begin(), w77.end(), std::ref(f16rng));
+  std::generate(w78.begin(), w78.end(), std::ref(f16rng));
+  std::generate(w79.begin(), w79.end(), std::ref(f16rng));
+  std::generate(w80.begin(), w80.end(), std::ref(f16rng));
+  std::generate(w81.begin(), w81.end(), std::ref(f16rng));
+  std::generate(w82.begin(), w82.end(), std::ref(f16rng));
+  std::generate(w83.begin(), w83.end(), std::ref(f16rng));
+  std::generate(w84.begin(), w84.end(), std::ref(f16rng));
+  std::generate(w85.begin(), w85.end(), std::ref(f16rng));
 
   ExecutionPlan operators;
   xnn_status status;
@@ -211,7 +212,7 @@
     32 /* output_channels_per_group */,
     3 /* input pixel stride */,
     32 /* output pixel stride */,
-    w30, w31,
+    w30.data(), w31.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op0);
@@ -233,7 +234,7 @@
     1 /* output_channels_per_group */,
     32 /* input pixel stride */,
     32 /* output pixel stride */,
-    w32, w33,
+    w32.data(), w33.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op1);
@@ -255,7 +256,7 @@
     64 /* output_channels_per_group */,
     32 /* input pixel stride */,
     64 /* output pixel stride */,
-    w34, w35,
+    w34.data(), w35.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op2);
@@ -277,7 +278,7 @@
     1 /* output_channels_per_group */,
     64 /* input pixel stride */,
     64 /* output pixel stride */,
-    w36, w37,
+    w36.data(), w37.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op3);
@@ -299,7 +300,7 @@
     128 /* output_channels_per_group */,
     64 /* input pixel stride */,
     128 /* output pixel stride */,
-    w38, w39,
+    w38.data(), w39.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op4);
@@ -321,7 +322,7 @@
     1 /* output_channels_per_group */,
     128 /* input pixel stride */,
     128 /* output pixel stride */,
-    w40, w41,
+    w40.data(), w41.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op5);
@@ -343,7 +344,7 @@
     128 /* output_channels_per_group */,
     128 /* input pixel stride */,
     128 /* output pixel stride */,
-    w42, w43,
+    w42.data(), w43.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op6);
@@ -365,7 +366,7 @@
     1 /* output_channels_per_group */,
     128 /* input pixel stride */,
     128 /* output pixel stride */,
-    w44, w45,
+    w44.data(), w45.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op7);
@@ -387,7 +388,7 @@
     256 /* output_channels_per_group */,
     128 /* input pixel stride */,
     256 /* output pixel stride */,
-    w46, w47,
+    w46.data(), w47.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op8);
@@ -409,7 +410,7 @@
     1 /* output_channels_per_group */,
     256 /* input pixel stride */,
     256 /* output pixel stride */,
-    w48, w49,
+    w48.data(), w49.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op9);
@@ -431,7 +432,7 @@
     256 /* output_channels_per_group */,
     256 /* input pixel stride */,
     256 /* output pixel stride */,
-    w50, w51,
+    w50.data(), w51.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op10);
@@ -453,7 +454,7 @@
     1 /* output_channels_per_group */,
     256 /* input pixel stride */,
     256 /* output pixel stride */,
-    w52, w53,
+    w52.data(), w53.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op11);
@@ -475,7 +476,7 @@
     512 /* output_channels_per_group */,
     256 /* input pixel stride */,
     512 /* output pixel stride */,
-    w54, w55,
+    w54.data(), w55.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op12);
@@ -497,7 +498,7 @@
     1 /* output_channels_per_group */,
     512 /* input pixel stride */,
     512 /* output pixel stride */,
-    w56, w57,
+    w56.data(), w57.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op13);
@@ -519,7 +520,7 @@
     512 /* output_channels_per_group */,
     512 /* input pixel stride */,
     512 /* output pixel stride */,
-    w58, w59,
+    w58.data(), w59.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op14);
@@ -541,7 +542,7 @@
     1 /* output_channels_per_group */,
     512 /* input pixel stride */,
     512 /* output pixel stride */,
-    w60, w61,
+    w60.data(), w61.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op15);
@@ -563,7 +564,7 @@
     512 /* output_channels_per_group */,
     512 /* input pixel stride */,
     512 /* output pixel stride */,
-    w62, w63,
+    w62.data(), w63.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op16);
@@ -585,7 +586,7 @@
     1 /* output_channels_per_group */,
     512 /* input pixel stride */,
     512 /* output pixel stride */,
-    w64, w65,
+    w64.data(), w65.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op17);
@@ -607,7 +608,7 @@
     512 /* output_channels_per_group */,
     512 /* input pixel stride */,
     512 /* output pixel stride */,
-    w66, w67,
+    w66.data(), w67.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op18);
@@ -629,7 +630,7 @@
     1 /* output_channels_per_group */,
     512 /* input pixel stride */,
     512 /* output pixel stride */,
-    w68, w69,
+    w68.data(), w69.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op19);
@@ -651,7 +652,7 @@
     512 /* output_channels_per_group */,
     512 /* input pixel stride */,
     512 /* output pixel stride */,
-    w70, w71,
+    w70.data(), w71.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op20);
@@ -673,7 +674,7 @@
     1 /* output_channels_per_group */,
     512 /* input pixel stride */,
     512 /* output pixel stride */,
-    w72, w73,
+    w72.data(), w73.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op21);
@@ -695,7 +696,7 @@
     512 /* output_channels_per_group */,
     512 /* input pixel stride */,
     512 /* output pixel stride */,
-    w74, w75,
+    w74.data(), w75.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op22);
@@ -717,7 +718,7 @@
     1 /* output_channels_per_group */,
     512 /* input pixel stride */,
     512 /* output pixel stride */,
-    w76, w77,
+    w76.data(), w77.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op23);
@@ -739,7 +740,7 @@
     1024 /* output_channels_per_group */,
     512 /* input pixel stride */,
     1024 /* output pixel stride */,
-    w78, w79,
+    w78.data(), w79.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op24);
@@ -761,7 +762,7 @@
     1 /* output_channels_per_group */,
     1024 /* input pixel stride */,
     1024 /* output pixel stride */,
-    w80, w81,
+    w80.data(), w81.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op25);
@@ -783,7 +784,7 @@
     1024 /* output_channels_per_group */,
     1024 /* input pixel stride */,
     1024 /* output pixel stride */,
-    w82, w83,
+    w82.data(), w83.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op26);
@@ -817,7 +818,7 @@
     1001 /* output_channels_per_group */,
     1024 /* input pixel stride */,
     1001 /* output pixel stride */,
-    w84, w85,
+    w84.data(), w85.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op28);
@@ -832,7 +833,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op0,
     1 /* batch size */, 224 /* input height */, 224 /* input width */,
-    v0 /* input */, v1 /* output */,
+    v0.data() /* input */, v1.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #0" << std::endl;
@@ -842,7 +843,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op1,
     1 /* batch size */, 112 /* input height */, 112 /* input width */,
-    v1 /* input */, v2 /* output */,
+    v1.data() /* input */, v2.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #1" << std::endl;
@@ -852,7 +853,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op2,
     1 /* batch size */, 112 /* input height */, 112 /* input width */,
-    v2 /* input */, v3 /* output */,
+    v2.data() /* input */, v3.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #2" << std::endl;
@@ -862,7 +863,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op3,
     1 /* batch size */, 112 /* input height */, 112 /* input width */,
-    v3 /* input */, v4 /* output */,
+    v3.data() /* input */, v4.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #3" << std::endl;
@@ -872,7 +873,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op4,
     1 /* batch size */, 56 /* input height */, 56 /* input width */,
-    v4 /* input */, v5 /* output */,
+    v4.data() /* input */, v5.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #4" << std::endl;
@@ -882,7 +883,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op5,
     1 /* batch size */, 56 /* input height */, 56 /* input width */,
-    v5 /* input */, v6 /* output */,
+    v5.data() /* input */, v6.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #5" << std::endl;
@@ -892,7 +893,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op6,
     1 /* batch size */, 56 /* input height */, 56 /* input width */,
-    v6 /* input */, v7 /* output */,
+    v6.data() /* input */, v7.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #6" << std::endl;
@@ -902,7 +903,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op7,
     1 /* batch size */, 56 /* input height */, 56 /* input width */,
-    v7 /* input */, v8 /* output */,
+    v7.data() /* input */, v8.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #7" << std::endl;
@@ -912,7 +913,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op8,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v8 /* input */, v9 /* output */,
+    v8.data() /* input */, v9.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #8" << std::endl;
@@ -922,7 +923,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op9,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v9 /* input */, v10 /* output */,
+    v9.data() /* input */, v10.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #9" << std::endl;
@@ -932,7 +933,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op10,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v10 /* input */, v11 /* output */,
+    v10.data() /* input */, v11.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #10" << std::endl;
@@ -942,7 +943,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op11,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v11 /* input */, v12 /* output */,
+    v11.data() /* input */, v12.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #11" << std::endl;
@@ -952,7 +953,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op12,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v12 /* input */, v13 /* output */,
+    v12.data() /* input */, v13.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #12" << std::endl;
@@ -962,7 +963,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op13,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v13 /* input */, v14 /* output */,
+    v13.data() /* input */, v14.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #13" << std::endl;
@@ -972,7 +973,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op14,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v14 /* input */, v15 /* output */,
+    v14.data() /* input */, v15.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #14" << std::endl;
@@ -982,7 +983,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op15,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v15 /* input */, v16 /* output */,
+    v15.data() /* input */, v16.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #15" << std::endl;
@@ -992,7 +993,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op16,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v16 /* input */, v17 /* output */,
+    v16.data() /* input */, v17.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #16" << std::endl;
@@ -1002,7 +1003,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op17,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v17 /* input */, v18 /* output */,
+    v17.data() /* input */, v18.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #17" << std::endl;
@@ -1012,7 +1013,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op18,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v18 /* input */, v19 /* output */,
+    v18.data() /* input */, v19.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #18" << std::endl;
@@ -1022,7 +1023,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op19,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v19 /* input */, v20 /* output */,
+    v19.data() /* input */, v20.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #19" << std::endl;
@@ -1032,7 +1033,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op20,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v20 /* input */, v21 /* output */,
+    v20.data() /* input */, v21.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #20" << std::endl;
@@ -1042,7 +1043,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op21,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v21 /* input */, v22 /* output */,
+    v21.data() /* input */, v22.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #21" << std::endl;
@@ -1052,7 +1053,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op22,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v22 /* input */, v23 /* output */,
+    v22.data() /* input */, v23.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #22" << std::endl;
@@ -1062,7 +1063,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op23,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v23 /* input */, v24 /* output */,
+    v23.data() /* input */, v24.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #23" << std::endl;
@@ -1072,7 +1073,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op24,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v24 /* input */, v25 /* output */,
+    v24.data() /* input */, v25.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #24" << std::endl;
@@ -1082,7 +1083,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op25,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v25 /* input */, v26 /* output */,
+    v25.data() /* input */, v26.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #25" << std::endl;
@@ -1092,7 +1093,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op26,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v26 /* input */, v27 /* output */,
+    v26.data() /* input */, v27.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #26" << std::endl;
@@ -1102,7 +1103,7 @@
   status = xnn_setup_global_average_pooling_nwc_f16(
     op27,
     1 /* batch size */, 49 /* width */,
-    v27 /* input */, v28 /* output */,
+    v27.data() /* input */, v28.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #27" << std::endl;
@@ -1112,7 +1113,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op28,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v28 /* input */, v29 /* output */,
+    v28.data() /* input */, v29.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #28" << std::endl;
diff --git a/models/fp16-mobilenet-v2.cc b/models/fp16-mobilenet-v2.cc
index ab18a39..54f4382 100644
--- a/models/fp16-mobilenet-v2.cc
+++ b/models/fp16-mobilenet-v2.cc
@@ -5,6 +5,7 @@
 
 #include <xnnpack.h>
 
+#include <array>
 #include <algorithm>
 #include <functional>
 #include <iostream>
@@ -18,353 +19,353 @@
 namespace models {
 
 ExecutionPlan FP16MobileNetV2(pthreadpool_t threadpool) {
-  alignas(16) static uint16_t v0[150528];
-  alignas(16) static uint16_t v1[401408];
-  alignas(16) static uint16_t v2[401408];
-  alignas(16) static uint16_t v3[200704];
-  alignas(16) static uint16_t v4[1204224];
-  alignas(16) static uint16_t v5[301056];
-  alignas(16) static uint16_t v6[75264];
-  alignas(16) static uint16_t v7[451584];
-  alignas(16) static uint16_t v8[451584];
-  alignas(16) static uint16_t v9[75264];
-  alignas(16) static uint16_t v10[75264];
-  alignas(16) static uint16_t v11[451584];
-  alignas(16) static uint16_t v12[112896];
-  alignas(16) static uint16_t v13[25088];
-  alignas(16) static uint16_t v14[150528];
-  alignas(16) static uint16_t v15[150528];
-  alignas(16) static uint16_t v16[25088];
-  alignas(16) static uint16_t v17[25088];
-  alignas(16) static uint16_t v18[150528];
-  alignas(16) static uint16_t v19[150528];
-  alignas(16) static uint16_t v20[25088];
-  alignas(16) static uint16_t v21[25088];
-  alignas(16) static uint16_t v22[150528];
-  alignas(16) static uint16_t v23[37632];
-  alignas(16) static uint16_t v24[12544];
-  alignas(16) static uint16_t v25[75264];
-  alignas(16) static uint16_t v26[75264];
-  alignas(16) static uint16_t v27[12544];
-  alignas(16) static uint16_t v28[12544];
-  alignas(16) static uint16_t v29[75264];
-  alignas(16) static uint16_t v30[75264];
-  alignas(16) static uint16_t v31[12544];
-  alignas(16) static uint16_t v32[12544];
-  alignas(16) static uint16_t v33[75264];
-  alignas(16) static uint16_t v34[75264];
-  alignas(16) static uint16_t v35[12544];
-  alignas(16) static uint16_t v36[12544];
-  alignas(16) static uint16_t v37[75264];
-  alignas(16) static uint16_t v38[75264];
-  alignas(16) static uint16_t v39[18816];
-  alignas(16) static uint16_t v40[112896];
-  alignas(16) static uint16_t v41[112896];
-  alignas(16) static uint16_t v42[18816];
-  alignas(16) static uint16_t v43[18816];
-  alignas(16) static uint16_t v44[112896];
-  alignas(16) static uint16_t v45[112896];
-  alignas(16) static uint16_t v46[18816];
-  alignas(16) static uint16_t v47[18816];
-  alignas(16) static uint16_t v48[112896];
-  alignas(16) static uint16_t v49[28224];
-  alignas(16) static uint16_t v50[7840];
-  alignas(16) static uint16_t v51[47040];
-  alignas(16) static uint16_t v52[47040];
-  alignas(16) static uint16_t v53[7840];
-  alignas(16) static uint16_t v54[7840];
-  alignas(16) static uint16_t v55[47040];
-  alignas(16) static uint16_t v56[47040];
-  alignas(16) static uint16_t v57[7840];
-  alignas(16) static uint16_t v58[7840];
-  alignas(16) static uint16_t v59[47040];
-  alignas(16) static uint16_t v60[47040];
-  alignas(16) static uint16_t v61[15680];
-  alignas(16) static uint16_t v62[62720];
-  alignas(16) static uint16_t v63[1280];
-  alignas(16) static uint16_t v64[1001];
-  alignas(16) static uint16_t w65[864];
-  alignas(16) static uint16_t w66[32];
-  alignas(16) static uint16_t w67[288];
-  alignas(16) static uint16_t w68[32];
-  alignas(16) static uint16_t w69[512];
-  alignas(16) static uint16_t w70[16];
-  alignas(16) static uint16_t w71[1536];
-  alignas(16) static uint16_t w72[96];
-  alignas(16) static uint16_t w73[864];
-  alignas(16) static uint16_t w74[96];
-  alignas(16) static uint16_t w75[2304];
-  alignas(16) static uint16_t w76[24];
-  alignas(16) static uint16_t w77[3456];
-  alignas(16) static uint16_t w78[144];
-  alignas(16) static uint16_t w79[1296];
-  alignas(16) static uint16_t w80[144];
-  alignas(16) static uint16_t w81[3456];
-  alignas(16) static uint16_t w82[24];
-  alignas(16) static uint16_t w83[3456];
-  alignas(16) static uint16_t w84[144];
-  alignas(16) static uint16_t w85[1296];
-  alignas(16) static uint16_t w86[144];
-  alignas(16) static uint16_t w87[4608];
-  alignas(16) static uint16_t w88[32];
-  alignas(16) static uint16_t w89[6144];
-  alignas(16) static uint16_t w90[192];
-  alignas(16) static uint16_t w91[1728];
-  alignas(16) static uint16_t w92[192];
-  alignas(16) static uint16_t w93[6144];
-  alignas(16) static uint16_t w94[32];
-  alignas(16) static uint16_t w95[6144];
-  alignas(16) static uint16_t w96[192];
-  alignas(16) static uint16_t w97[1728];
-  alignas(16) static uint16_t w98[192];
-  alignas(16) static uint16_t w99[6144];
-  alignas(16) static uint16_t w100[32];
-  alignas(16) static uint16_t w101[6144];
-  alignas(16) static uint16_t w102[192];
-  alignas(16) static uint16_t w103[1728];
-  alignas(16) static uint16_t w104[192];
-  alignas(16) static uint16_t w105[12288];
-  alignas(16) static uint16_t w106[64];
-  alignas(16) static uint16_t w107[24576];
-  alignas(16) static uint16_t w108[384];
-  alignas(16) static uint16_t w109[3456];
-  alignas(16) static uint16_t w110[384];
-  alignas(16) static uint16_t w111[24576];
-  alignas(16) static uint16_t w112[64];
-  alignas(16) static uint16_t w113[24576];
-  alignas(16) static uint16_t w114[384];
-  alignas(16) static uint16_t w115[3456];
-  alignas(16) static uint16_t w116[384];
-  alignas(16) static uint16_t w117[24576];
-  alignas(16) static uint16_t w118[64];
-  alignas(16) static uint16_t w119[24576];
-  alignas(16) static uint16_t w120[384];
-  alignas(16) static uint16_t w121[3456];
-  alignas(16) static uint16_t w122[384];
-  alignas(16) static uint16_t w123[24576];
-  alignas(16) static uint16_t w124[64];
-  alignas(16) static uint16_t w125[24576];
-  alignas(16) static uint16_t w126[384];
-  alignas(16) static uint16_t w127[3456];
-  alignas(16) static uint16_t w128[384];
-  alignas(16) static uint16_t w129[36864];
-  alignas(16) static uint16_t w130[96];
-  alignas(16) static uint16_t w131[55296];
-  alignas(16) static uint16_t w132[576];
-  alignas(16) static uint16_t w133[5184];
-  alignas(16) static uint16_t w134[576];
-  alignas(16) static uint16_t w135[55296];
-  alignas(16) static uint16_t w136[96];
-  alignas(16) static uint16_t w137[55296];
-  alignas(16) static uint16_t w138[576];
-  alignas(16) static uint16_t w139[5184];
-  alignas(16) static uint16_t w140[576];
-  alignas(16) static uint16_t w141[55296];
-  alignas(16) static uint16_t w142[96];
-  alignas(16) static uint16_t w143[55296];
-  alignas(16) static uint16_t w144[576];
-  alignas(16) static uint16_t w145[5184];
-  alignas(16) static uint16_t w146[576];
-  alignas(16) static uint16_t w147[92160];
-  alignas(16) static uint16_t w148[160];
-  alignas(16) static uint16_t w149[153600];
-  alignas(16) static uint16_t w150[960];
-  alignas(16) static uint16_t w151[8640];
-  alignas(16) static uint16_t w152[960];
-  alignas(16) static uint16_t w153[153600];
-  alignas(16) static uint16_t w154[160];
-  alignas(16) static uint16_t w155[153600];
-  alignas(16) static uint16_t w156[960];
-  alignas(16) static uint16_t w157[8640];
-  alignas(16) static uint16_t w158[960];
-  alignas(16) static uint16_t w159[153600];
-  alignas(16) static uint16_t w160[160];
-  alignas(16) static uint16_t w161[153600];
-  alignas(16) static uint16_t w162[960];
-  alignas(16) static uint16_t w163[8640];
-  alignas(16) static uint16_t w164[960];
-  alignas(16) static uint16_t w165[307200];
-  alignas(16) static uint16_t w166[320];
-  alignas(16) static uint16_t w167[409600];
-  alignas(16) static uint16_t w168[1280];
-  alignas(16) static uint16_t w169[1281280];
-  alignas(16) static uint16_t w170[1001];
+  alignas(16) static std::array<uint16_t, 150528> v0;
+  alignas(16) static std::array<uint16_t, 401408> v1;
+  alignas(16) static std::array<uint16_t, 401408> v2;
+  alignas(16) static std::array<uint16_t, 200704> v3;
+  alignas(16) static std::array<uint16_t, 1204224> v4;
+  alignas(16) static std::array<uint16_t, 301056> v5;
+  alignas(16) static std::array<uint16_t, 75264> v6;
+  alignas(16) static std::array<uint16_t, 451584> v7;
+  alignas(16) static std::array<uint16_t, 451584> v8;
+  alignas(16) static std::array<uint16_t, 75264> v9;
+  alignas(16) static std::array<uint16_t, 75264> v10;
+  alignas(16) static std::array<uint16_t, 451584> v11;
+  alignas(16) static std::array<uint16_t, 112896> v12;
+  alignas(16) static std::array<uint16_t, 25088> v13;
+  alignas(16) static std::array<uint16_t, 150528> v14;
+  alignas(16) static std::array<uint16_t, 150528> v15;
+  alignas(16) static std::array<uint16_t, 25088> v16;
+  alignas(16) static std::array<uint16_t, 25088> v17;
+  alignas(16) static std::array<uint16_t, 150528> v18;
+  alignas(16) static std::array<uint16_t, 150528> v19;
+  alignas(16) static std::array<uint16_t, 25088> v20;
+  alignas(16) static std::array<uint16_t, 25088> v21;
+  alignas(16) static std::array<uint16_t, 150528> v22;
+  alignas(16) static std::array<uint16_t, 37632> v23;
+  alignas(16) static std::array<uint16_t, 12544> v24;
+  alignas(16) static std::array<uint16_t, 75264> v25;
+  alignas(16) static std::array<uint16_t, 75264> v26;
+  alignas(16) static std::array<uint16_t, 12544> v27;
+  alignas(16) static std::array<uint16_t, 12544> v28;
+  alignas(16) static std::array<uint16_t, 75264> v29;
+  alignas(16) static std::array<uint16_t, 75264> v30;
+  alignas(16) static std::array<uint16_t, 12544> v31;
+  alignas(16) static std::array<uint16_t, 12544> v32;
+  alignas(16) static std::array<uint16_t, 75264> v33;
+  alignas(16) static std::array<uint16_t, 75264> v34;
+  alignas(16) static std::array<uint16_t, 12544> v35;
+  alignas(16) static std::array<uint16_t, 12544> v36;
+  alignas(16) static std::array<uint16_t, 75264> v37;
+  alignas(16) static std::array<uint16_t, 75264> v38;
+  alignas(16) static std::array<uint16_t, 18816> v39;
+  alignas(16) static std::array<uint16_t, 112896> v40;
+  alignas(16) static std::array<uint16_t, 112896> v41;
+  alignas(16) static std::array<uint16_t, 18816> v42;
+  alignas(16) static std::array<uint16_t, 18816> v43;
+  alignas(16) static std::array<uint16_t, 112896> v44;
+  alignas(16) static std::array<uint16_t, 112896> v45;
+  alignas(16) static std::array<uint16_t, 18816> v46;
+  alignas(16) static std::array<uint16_t, 18816> v47;
+  alignas(16) static std::array<uint16_t, 112896> v48;
+  alignas(16) static std::array<uint16_t, 28224> v49;
+  alignas(16) static std::array<uint16_t, 7840> v50;
+  alignas(16) static std::array<uint16_t, 47040> v51;
+  alignas(16) static std::array<uint16_t, 47040> v52;
+  alignas(16) static std::array<uint16_t, 7840> v53;
+  alignas(16) static std::array<uint16_t, 7840> v54;
+  alignas(16) static std::array<uint16_t, 47040> v55;
+  alignas(16) static std::array<uint16_t, 47040> v56;
+  alignas(16) static std::array<uint16_t, 7840> v57;
+  alignas(16) static std::array<uint16_t, 7840> v58;
+  alignas(16) static std::array<uint16_t, 47040> v59;
+  alignas(16) static std::array<uint16_t, 47040> v60;
+  alignas(16) static std::array<uint16_t, 15680> v61;
+  alignas(16) static std::array<uint16_t, 62720> v62;
+  alignas(16) static std::array<uint16_t, 1280> v63;
+  alignas(16) static std::array<uint16_t, 1001> v64;
+  alignas(16) static std::array<uint16_t, 864> w65;
+  alignas(16) static std::array<uint16_t, 32> w66;
+  alignas(16) static std::array<uint16_t, 288> w67;
+  alignas(16) static std::array<uint16_t, 32> w68;
+  alignas(16) static std::array<uint16_t, 512> w69;
+  alignas(16) static std::array<uint16_t, 16> w70;
+  alignas(16) static std::array<uint16_t, 1536> w71;
+  alignas(16) static std::array<uint16_t, 96> w72;
+  alignas(16) static std::array<uint16_t, 864> w73;
+  alignas(16) static std::array<uint16_t, 96> w74;
+  alignas(16) static std::array<uint16_t, 2304> w75;
+  alignas(16) static std::array<uint16_t, 24> w76;
+  alignas(16) static std::array<uint16_t, 3456> w77;
+  alignas(16) static std::array<uint16_t, 144> w78;
+  alignas(16) static std::array<uint16_t, 1296> w79;
+  alignas(16) static std::array<uint16_t, 144> w80;
+  alignas(16) static std::array<uint16_t, 3456> w81;
+  alignas(16) static std::array<uint16_t, 24> w82;
+  alignas(16) static std::array<uint16_t, 3456> w83;
+  alignas(16) static std::array<uint16_t, 144> w84;
+  alignas(16) static std::array<uint16_t, 1296> w85;
+  alignas(16) static std::array<uint16_t, 144> w86;
+  alignas(16) static std::array<uint16_t, 4608> w87;
+  alignas(16) static std::array<uint16_t, 32> w88;
+  alignas(16) static std::array<uint16_t, 6144> w89;
+  alignas(16) static std::array<uint16_t, 192> w90;
+  alignas(16) static std::array<uint16_t, 1728> w91;
+  alignas(16) static std::array<uint16_t, 192> w92;
+  alignas(16) static std::array<uint16_t, 6144> w93;
+  alignas(16) static std::array<uint16_t, 32> w94;
+  alignas(16) static std::array<uint16_t, 6144> w95;
+  alignas(16) static std::array<uint16_t, 192> w96;
+  alignas(16) static std::array<uint16_t, 1728> w97;
+  alignas(16) static std::array<uint16_t, 192> w98;
+  alignas(16) static std::array<uint16_t, 6144> w99;
+  alignas(16) static std::array<uint16_t, 32> w100;
+  alignas(16) static std::array<uint16_t, 6144> w101;
+  alignas(16) static std::array<uint16_t, 192> w102;
+  alignas(16) static std::array<uint16_t, 1728> w103;
+  alignas(16) static std::array<uint16_t, 192> w104;
+  alignas(16) static std::array<uint16_t, 12288> w105;
+  alignas(16) static std::array<uint16_t, 64> w106;
+  alignas(16) static std::array<uint16_t, 24576> w107;
+  alignas(16) static std::array<uint16_t, 384> w108;
+  alignas(16) static std::array<uint16_t, 3456> w109;
+  alignas(16) static std::array<uint16_t, 384> w110;
+  alignas(16) static std::array<uint16_t, 24576> w111;
+  alignas(16) static std::array<uint16_t, 64> w112;
+  alignas(16) static std::array<uint16_t, 24576> w113;
+  alignas(16) static std::array<uint16_t, 384> w114;
+  alignas(16) static std::array<uint16_t, 3456> w115;
+  alignas(16) static std::array<uint16_t, 384> w116;
+  alignas(16) static std::array<uint16_t, 24576> w117;
+  alignas(16) static std::array<uint16_t, 64> w118;
+  alignas(16) static std::array<uint16_t, 24576> w119;
+  alignas(16) static std::array<uint16_t, 384> w120;
+  alignas(16) static std::array<uint16_t, 3456> w121;
+  alignas(16) static std::array<uint16_t, 384> w122;
+  alignas(16) static std::array<uint16_t, 24576> w123;
+  alignas(16) static std::array<uint16_t, 64> w124;
+  alignas(16) static std::array<uint16_t, 24576> w125;
+  alignas(16) static std::array<uint16_t, 384> w126;
+  alignas(16) static std::array<uint16_t, 3456> w127;
+  alignas(16) static std::array<uint16_t, 384> w128;
+  alignas(16) static std::array<uint16_t, 36864> w129;
+  alignas(16) static std::array<uint16_t, 96> w130;
+  alignas(16) static std::array<uint16_t, 55296> w131;
+  alignas(16) static std::array<uint16_t, 576> w132;
+  alignas(16) static std::array<uint16_t, 5184> w133;
+  alignas(16) static std::array<uint16_t, 576> w134;
+  alignas(16) static std::array<uint16_t, 55296> w135;
+  alignas(16) static std::array<uint16_t, 96> w136;
+  alignas(16) static std::array<uint16_t, 55296> w137;
+  alignas(16) static std::array<uint16_t, 576> w138;
+  alignas(16) static std::array<uint16_t, 5184> w139;
+  alignas(16) static std::array<uint16_t, 576> w140;
+  alignas(16) static std::array<uint16_t, 55296> w141;
+  alignas(16) static std::array<uint16_t, 96> w142;
+  alignas(16) static std::array<uint16_t, 55296> w143;
+  alignas(16) static std::array<uint16_t, 576> w144;
+  alignas(16) static std::array<uint16_t, 5184> w145;
+  alignas(16) static std::array<uint16_t, 576> w146;
+  alignas(16) static std::array<uint16_t, 92160> w147;
+  alignas(16) static std::array<uint16_t, 160> w148;
+  alignas(16) static std::array<uint16_t, 153600> w149;
+  alignas(16) static std::array<uint16_t, 960> w150;
+  alignas(16) static std::array<uint16_t, 8640> w151;
+  alignas(16) static std::array<uint16_t, 960> w152;
+  alignas(16) static std::array<uint16_t, 153600> w153;
+  alignas(16) static std::array<uint16_t, 160> w154;
+  alignas(16) static std::array<uint16_t, 153600> w155;
+  alignas(16) static std::array<uint16_t, 960> w156;
+  alignas(16) static std::array<uint16_t, 8640> w157;
+  alignas(16) static std::array<uint16_t, 960> w158;
+  alignas(16) static std::array<uint16_t, 153600> w159;
+  alignas(16) static std::array<uint16_t, 160> w160;
+  alignas(16) static std::array<uint16_t, 153600> w161;
+  alignas(16) static std::array<uint16_t, 960> w162;
+  alignas(16) static std::array<uint16_t, 8640> w163;
+  alignas(16) static std::array<uint16_t, 960> w164;
+  alignas(16) static std::array<uint16_t, 307200> w165;
+  alignas(16) static std::array<uint16_t, 320> w166;
+  alignas(16) static std::array<uint16_t, 409600> w167;
+  alignas(16) static std::array<uint16_t, 1280> w168;
+  alignas(16) static std::array<uint16_t, 1281280> w169;
+  alignas(16) static std::array<uint16_t, 1001> w170;
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.0f, +1.0f), std::ref(rng));
   auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
-  std::generate(v0, v0 + 150528, std::ref(f16rng));
-  std::generate(v1, v1 + 401408, std::ref(f16rng));
-  std::generate(v2, v2 + 401408, std::ref(f16rng));
-  std::generate(v3, v3 + 200704, std::ref(f16rng));
-  std::generate(v4, v4 + 1204224, std::ref(f16rng));
-  std::generate(v5, v5 + 301056, std::ref(f16rng));
-  std::generate(v6, v6 + 75264, std::ref(f16rng));
-  std::generate(v7, v7 + 451584, std::ref(f16rng));
-  std::generate(v8, v8 + 451584, std::ref(f16rng));
-  std::generate(v9, v9 + 75264, std::ref(f16rng));
-  std::generate(v10, v10 + 75264, std::ref(f16rng));
-  std::generate(v11, v11 + 451584, std::ref(f16rng));
-  std::generate(v12, v12 + 112896, std::ref(f16rng));
-  std::generate(v13, v13 + 25088, std::ref(f16rng));
-  std::generate(v14, v14 + 150528, std::ref(f16rng));
-  std::generate(v15, v15 + 150528, std::ref(f16rng));
-  std::generate(v16, v16 + 25088, std::ref(f16rng));
-  std::generate(v17, v17 + 25088, std::ref(f16rng));
-  std::generate(v18, v18 + 150528, std::ref(f16rng));
-  std::generate(v19, v19 + 150528, std::ref(f16rng));
-  std::generate(v20, v20 + 25088, std::ref(f16rng));
-  std::generate(v21, v21 + 25088, std::ref(f16rng));
-  std::generate(v22, v22 + 150528, std::ref(f16rng));
-  std::generate(v23, v23 + 37632, std::ref(f16rng));
-  std::generate(v24, v24 + 12544, std::ref(f16rng));
-  std::generate(v25, v25 + 75264, std::ref(f16rng));
-  std::generate(v26, v26 + 75264, std::ref(f16rng));
-  std::generate(v27, v27 + 12544, std::ref(f16rng));
-  std::generate(v28, v28 + 12544, std::ref(f16rng));
-  std::generate(v29, v29 + 75264, std::ref(f16rng));
-  std::generate(v30, v30 + 75264, std::ref(f16rng));
-  std::generate(v31, v31 + 12544, std::ref(f16rng));
-  std::generate(v32, v32 + 12544, std::ref(f16rng));
-  std::generate(v33, v33 + 75264, std::ref(f16rng));
-  std::generate(v34, v34 + 75264, std::ref(f16rng));
-  std::generate(v35, v35 + 12544, std::ref(f16rng));
-  std::generate(v36, v36 + 12544, std::ref(f16rng));
-  std::generate(v37, v37 + 75264, std::ref(f16rng));
-  std::generate(v38, v38 + 75264, std::ref(f16rng));
-  std::generate(v39, v39 + 18816, std::ref(f16rng));
-  std::generate(v40, v40 + 112896, std::ref(f16rng));
-  std::generate(v41, v41 + 112896, std::ref(f16rng));
-  std::generate(v42, v42 + 18816, std::ref(f16rng));
-  std::generate(v43, v43 + 18816, std::ref(f16rng));
-  std::generate(v44, v44 + 112896, std::ref(f16rng));
-  std::generate(v45, v45 + 112896, std::ref(f16rng));
-  std::generate(v46, v46 + 18816, std::ref(f16rng));
-  std::generate(v47, v47 + 18816, std::ref(f16rng));
-  std::generate(v48, v48 + 112896, std::ref(f16rng));
-  std::generate(v49, v49 + 28224, std::ref(f16rng));
-  std::generate(v50, v50 + 7840, std::ref(f16rng));
-  std::generate(v51, v51 + 47040, std::ref(f16rng));
-  std::generate(v52, v52 + 47040, std::ref(f16rng));
-  std::generate(v53, v53 + 7840, std::ref(f16rng));
-  std::generate(v54, v54 + 7840, std::ref(f16rng));
-  std::generate(v55, v55 + 47040, std::ref(f16rng));
-  std::generate(v56, v56 + 47040, std::ref(f16rng));
-  std::generate(v57, v57 + 7840, std::ref(f16rng));
-  std::generate(v58, v58 + 7840, std::ref(f16rng));
-  std::generate(v59, v59 + 47040, std::ref(f16rng));
-  std::generate(v60, v60 + 47040, std::ref(f16rng));
-  std::generate(v61, v61 + 15680, std::ref(f16rng));
-  std::generate(v62, v62 + 62720, std::ref(f16rng));
-  std::generate(v63, v63 + 1280, std::ref(f16rng));
-  std::generate(v64, v64 + 1001, std::ref(f16rng));
-  std::generate(w65, w65 + 864, std::ref(f16rng));
-  std::generate(w66, w66 + 32, std::ref(f16rng));
-  std::generate(w67, w67 + 288, std::ref(f16rng));
-  std::generate(w68, w68 + 32, std::ref(f16rng));
-  std::generate(w69, w69 + 512, std::ref(f16rng));
-  std::generate(w70, w70 + 16, std::ref(f16rng));
-  std::generate(w71, w71 + 1536, std::ref(f16rng));
-  std::generate(w72, w72 + 96, std::ref(f16rng));
-  std::generate(w73, w73 + 864, std::ref(f16rng));
-  std::generate(w74, w74 + 96, std::ref(f16rng));
-  std::generate(w75, w75 + 2304, std::ref(f16rng));
-  std::generate(w76, w76 + 24, std::ref(f16rng));
-  std::generate(w77, w77 + 3456, std::ref(f16rng));
-  std::generate(w78, w78 + 144, std::ref(f16rng));
-  std::generate(w79, w79 + 1296, std::ref(f16rng));
-  std::generate(w80, w80 + 144, std::ref(f16rng));
-  std::generate(w81, w81 + 3456, std::ref(f16rng));
-  std::generate(w82, w82 + 24, std::ref(f16rng));
-  std::generate(w83, w83 + 3456, std::ref(f16rng));
-  std::generate(w84, w84 + 144, std::ref(f16rng));
-  std::generate(w85, w85 + 1296, std::ref(f16rng));
-  std::generate(w86, w86 + 144, std::ref(f16rng));
-  std::generate(w87, w87 + 4608, std::ref(f16rng));
-  std::generate(w88, w88 + 32, std::ref(f16rng));
-  std::generate(w89, w89 + 6144, std::ref(f16rng));
-  std::generate(w90, w90 + 192, std::ref(f16rng));
-  std::generate(w91, w91 + 1728, std::ref(f16rng));
-  std::generate(w92, w92 + 192, std::ref(f16rng));
-  std::generate(w93, w93 + 6144, std::ref(f16rng));
-  std::generate(w94, w94 + 32, std::ref(f16rng));
-  std::generate(w95, w95 + 6144, std::ref(f16rng));
-  std::generate(w96, w96 + 192, std::ref(f16rng));
-  std::generate(w97, w97 + 1728, std::ref(f16rng));
-  std::generate(w98, w98 + 192, std::ref(f16rng));
-  std::generate(w99, w99 + 6144, std::ref(f16rng));
-  std::generate(w100, w100 + 32, std::ref(f16rng));
-  std::generate(w101, w101 + 6144, std::ref(f16rng));
-  std::generate(w102, w102 + 192, std::ref(f16rng));
-  std::generate(w103, w103 + 1728, std::ref(f16rng));
-  std::generate(w104, w104 + 192, std::ref(f16rng));
-  std::generate(w105, w105 + 12288, std::ref(f16rng));
-  std::generate(w106, w106 + 64, std::ref(f16rng));
-  std::generate(w107, w107 + 24576, std::ref(f16rng));
-  std::generate(w108, w108 + 384, std::ref(f16rng));
-  std::generate(w109, w109 + 3456, std::ref(f16rng));
-  std::generate(w110, w110 + 384, std::ref(f16rng));
-  std::generate(w111, w111 + 24576, std::ref(f16rng));
-  std::generate(w112, w112 + 64, std::ref(f16rng));
-  std::generate(w113, w113 + 24576, std::ref(f16rng));
-  std::generate(w114, w114 + 384, std::ref(f16rng));
-  std::generate(w115, w115 + 3456, std::ref(f16rng));
-  std::generate(w116, w116 + 384, std::ref(f16rng));
-  std::generate(w117, w117 + 24576, std::ref(f16rng));
-  std::generate(w118, w118 + 64, std::ref(f16rng));
-  std::generate(w119, w119 + 24576, std::ref(f16rng));
-  std::generate(w120, w120 + 384, std::ref(f16rng));
-  std::generate(w121, w121 + 3456, std::ref(f16rng));
-  std::generate(w122, w122 + 384, std::ref(f16rng));
-  std::generate(w123, w123 + 24576, std::ref(f16rng));
-  std::generate(w124, w124 + 64, std::ref(f16rng));
-  std::generate(w125, w125 + 24576, std::ref(f16rng));
-  std::generate(w126, w126 + 384, std::ref(f16rng));
-  std::generate(w127, w127 + 3456, std::ref(f16rng));
-  std::generate(w128, w128 + 384, std::ref(f16rng));
-  std::generate(w129, w129 + 36864, std::ref(f16rng));
-  std::generate(w130, w130 + 96, std::ref(f16rng));
-  std::generate(w131, w131 + 55296, std::ref(f16rng));
-  std::generate(w132, w132 + 576, std::ref(f16rng));
-  std::generate(w133, w133 + 5184, std::ref(f16rng));
-  std::generate(w134, w134 + 576, std::ref(f16rng));
-  std::generate(w135, w135 + 55296, std::ref(f16rng));
-  std::generate(w136, w136 + 96, std::ref(f16rng));
-  std::generate(w137, w137 + 55296, std::ref(f16rng));
-  std::generate(w138, w138 + 576, std::ref(f16rng));
-  std::generate(w139, w139 + 5184, std::ref(f16rng));
-  std::generate(w140, w140 + 576, std::ref(f16rng));
-  std::generate(w141, w141 + 55296, std::ref(f16rng));
-  std::generate(w142, w142 + 96, std::ref(f16rng));
-  std::generate(w143, w143 + 55296, std::ref(f16rng));
-  std::generate(w144, w144 + 576, std::ref(f16rng));
-  std::generate(w145, w145 + 5184, std::ref(f16rng));
-  std::generate(w146, w146 + 576, std::ref(f16rng));
-  std::generate(w147, w147 + 92160, std::ref(f16rng));
-  std::generate(w148, w148 + 160, std::ref(f16rng));
-  std::generate(w149, w149 + 153600, std::ref(f16rng));
-  std::generate(w150, w150 + 960, std::ref(f16rng));
-  std::generate(w151, w151 + 8640, std::ref(f16rng));
-  std::generate(w152, w152 + 960, std::ref(f16rng));
-  std::generate(w153, w153 + 153600, std::ref(f16rng));
-  std::generate(w154, w154 + 160, std::ref(f16rng));
-  std::generate(w155, w155 + 153600, std::ref(f16rng));
-  std::generate(w156, w156 + 960, std::ref(f16rng));
-  std::generate(w157, w157 + 8640, std::ref(f16rng));
-  std::generate(w158, w158 + 960, std::ref(f16rng));
-  std::generate(w159, w159 + 153600, std::ref(f16rng));
-  std::generate(w160, w160 + 160, std::ref(f16rng));
-  std::generate(w161, w161 + 153600, std::ref(f16rng));
-  std::generate(w162, w162 + 960, std::ref(f16rng));
-  std::generate(w163, w163 + 8640, std::ref(f16rng));
-  std::generate(w164, w164 + 960, std::ref(f16rng));
-  std::generate(w165, w165 + 307200, std::ref(f16rng));
-  std::generate(w166, w166 + 320, std::ref(f16rng));
-  std::generate(w167, w167 + 409600, std::ref(f16rng));
-  std::generate(w168, w168 + 1280, std::ref(f16rng));
-  std::generate(w169, w169 + 1281280, std::ref(f16rng));
-  std::generate(w170, w170 + 1001, std::ref(f16rng));
+  std::generate(v0.begin(), v0.end(), std::ref(f16rng));
+  std::generate(v1.begin(), v1.end(), std::ref(f16rng));
+  std::generate(v2.begin(), v2.end(), std::ref(f16rng));
+  std::generate(v3.begin(), v3.end(), std::ref(f16rng));
+  std::generate(v4.begin(), v4.end(), std::ref(f16rng));
+  std::generate(v5.begin(), v5.end(), std::ref(f16rng));
+  std::generate(v6.begin(), v6.end(), std::ref(f16rng));
+  std::generate(v7.begin(), v7.end(), std::ref(f16rng));
+  std::generate(v8.begin(), v8.end(), std::ref(f16rng));
+  std::generate(v9.begin(), v9.end(), std::ref(f16rng));
+  std::generate(v10.begin(), v10.end(), std::ref(f16rng));
+  std::generate(v11.begin(), v11.end(), std::ref(f16rng));
+  std::generate(v12.begin(), v12.end(), std::ref(f16rng));
+  std::generate(v13.begin(), v13.end(), std::ref(f16rng));
+  std::generate(v14.begin(), v14.end(), std::ref(f16rng));
+  std::generate(v15.begin(), v15.end(), std::ref(f16rng));
+  std::generate(v16.begin(), v16.end(), std::ref(f16rng));
+  std::generate(v17.begin(), v17.end(), std::ref(f16rng));
+  std::generate(v18.begin(), v18.end(), std::ref(f16rng));
+  std::generate(v19.begin(), v19.end(), std::ref(f16rng));
+  std::generate(v20.begin(), v20.end(), std::ref(f16rng));
+  std::generate(v21.begin(), v21.end(), std::ref(f16rng));
+  std::generate(v22.begin(), v22.end(), std::ref(f16rng));
+  std::generate(v23.begin(), v23.end(), std::ref(f16rng));
+  std::generate(v24.begin(), v24.end(), std::ref(f16rng));
+  std::generate(v25.begin(), v25.end(), std::ref(f16rng));
+  std::generate(v26.begin(), v26.end(), std::ref(f16rng));
+  std::generate(v27.begin(), v27.end(), std::ref(f16rng));
+  std::generate(v28.begin(), v28.end(), std::ref(f16rng));
+  std::generate(v29.begin(), v29.end(), std::ref(f16rng));
+  std::generate(v30.begin(), v30.end(), std::ref(f16rng));
+  std::generate(v31.begin(), v31.end(), std::ref(f16rng));
+  std::generate(v32.begin(), v32.end(), std::ref(f16rng));
+  std::generate(v33.begin(), v33.end(), std::ref(f16rng));
+  std::generate(v34.begin(), v34.end(), std::ref(f16rng));
+  std::generate(v35.begin(), v35.end(), std::ref(f16rng));
+  std::generate(v36.begin(), v36.end(), std::ref(f16rng));
+  std::generate(v37.begin(), v37.end(), std::ref(f16rng));
+  std::generate(v38.begin(), v38.end(), std::ref(f16rng));
+  std::generate(v39.begin(), v39.end(), std::ref(f16rng));
+  std::generate(v40.begin(), v40.end(), std::ref(f16rng));
+  std::generate(v41.begin(), v41.end(), std::ref(f16rng));
+  std::generate(v42.begin(), v42.end(), std::ref(f16rng));
+  std::generate(v43.begin(), v43.end(), std::ref(f16rng));
+  std::generate(v44.begin(), v44.end(), std::ref(f16rng));
+  std::generate(v45.begin(), v45.end(), std::ref(f16rng));
+  std::generate(v46.begin(), v46.end(), std::ref(f16rng));
+  std::generate(v47.begin(), v47.end(), std::ref(f16rng));
+  std::generate(v48.begin(), v48.end(), std::ref(f16rng));
+  std::generate(v49.begin(), v49.end(), std::ref(f16rng));
+  std::generate(v50.begin(), v50.end(), std::ref(f16rng));
+  std::generate(v51.begin(), v51.end(), std::ref(f16rng));
+  std::generate(v52.begin(), v52.end(), std::ref(f16rng));
+  std::generate(v53.begin(), v53.end(), std::ref(f16rng));
+  std::generate(v54.begin(), v54.end(), std::ref(f16rng));
+  std::generate(v55.begin(), v55.end(), std::ref(f16rng));
+  std::generate(v56.begin(), v56.end(), std::ref(f16rng));
+  std::generate(v57.begin(), v57.end(), std::ref(f16rng));
+  std::generate(v58.begin(), v58.end(), std::ref(f16rng));
+  std::generate(v59.begin(), v59.end(), std::ref(f16rng));
+  std::generate(v60.begin(), v60.end(), std::ref(f16rng));
+  std::generate(v61.begin(), v61.end(), std::ref(f16rng));
+  std::generate(v62.begin(), v62.end(), std::ref(f16rng));
+  std::generate(v63.begin(), v63.end(), std::ref(f16rng));
+  std::generate(v64.begin(), v64.end(), std::ref(f16rng));
+  std::generate(w65.begin(), w65.end(), std::ref(f16rng));
+  std::generate(w66.begin(), w66.end(), std::ref(f16rng));
+  std::generate(w67.begin(), w67.end(), std::ref(f16rng));
+  std::generate(w68.begin(), w68.end(), std::ref(f16rng));
+  std::generate(w69.begin(), w69.end(), std::ref(f16rng));
+  std::generate(w70.begin(), w70.end(), std::ref(f16rng));
+  std::generate(w71.begin(), w71.end(), std::ref(f16rng));
+  std::generate(w72.begin(), w72.end(), std::ref(f16rng));
+  std::generate(w73.begin(), w73.end(), std::ref(f16rng));
+  std::generate(w74.begin(), w74.end(), std::ref(f16rng));
+  std::generate(w75.begin(), w75.end(), std::ref(f16rng));
+  std::generate(w76.begin(), w76.end(), std::ref(f16rng));
+  std::generate(w77.begin(), w77.end(), std::ref(f16rng));
+  std::generate(w78.begin(), w78.end(), std::ref(f16rng));
+  std::generate(w79.begin(), w79.end(), std::ref(f16rng));
+  std::generate(w80.begin(), w80.end(), std::ref(f16rng));
+  std::generate(w81.begin(), w81.end(), std::ref(f16rng));
+  std::generate(w82.begin(), w82.end(), std::ref(f16rng));
+  std::generate(w83.begin(), w83.end(), std::ref(f16rng));
+  std::generate(w84.begin(), w84.end(), std::ref(f16rng));
+  std::generate(w85.begin(), w85.end(), std::ref(f16rng));
+  std::generate(w86.begin(), w86.end(), std::ref(f16rng));
+  std::generate(w87.begin(), w87.end(), std::ref(f16rng));
+  std::generate(w88.begin(), w88.end(), std::ref(f16rng));
+  std::generate(w89.begin(), w89.end(), std::ref(f16rng));
+  std::generate(w90.begin(), w90.end(), std::ref(f16rng));
+  std::generate(w91.begin(), w91.end(), std::ref(f16rng));
+  std::generate(w92.begin(), w92.end(), std::ref(f16rng));
+  std::generate(w93.begin(), w93.end(), std::ref(f16rng));
+  std::generate(w94.begin(), w94.end(), std::ref(f16rng));
+  std::generate(w95.begin(), w95.end(), std::ref(f16rng));
+  std::generate(w96.begin(), w96.end(), std::ref(f16rng));
+  std::generate(w97.begin(), w97.end(), std::ref(f16rng));
+  std::generate(w98.begin(), w98.end(), std::ref(f16rng));
+  std::generate(w99.begin(), w99.end(), std::ref(f16rng));
+  std::generate(w100.begin(), w100.end(), std::ref(f16rng));
+  std::generate(w101.begin(), w101.end(), std::ref(f16rng));
+  std::generate(w102.begin(), w102.end(), std::ref(f16rng));
+  std::generate(w103.begin(), w103.end(), std::ref(f16rng));
+  std::generate(w104.begin(), w104.end(), std::ref(f16rng));
+  std::generate(w105.begin(), w105.end(), std::ref(f16rng));
+  std::generate(w106.begin(), w106.end(), std::ref(f16rng));
+  std::generate(w107.begin(), w107.end(), std::ref(f16rng));
+  std::generate(w108.begin(), w108.end(), std::ref(f16rng));
+  std::generate(w109.begin(), w109.end(), std::ref(f16rng));
+  std::generate(w110.begin(), w110.end(), std::ref(f16rng));
+  std::generate(w111.begin(), w111.end(), std::ref(f16rng));
+  std::generate(w112.begin(), w112.end(), std::ref(f16rng));
+  std::generate(w113.begin(), w113.end(), std::ref(f16rng));
+  std::generate(w114.begin(), w114.end(), std::ref(f16rng));
+  std::generate(w115.begin(), w115.end(), std::ref(f16rng));
+  std::generate(w116.begin(), w116.end(), std::ref(f16rng));
+  std::generate(w117.begin(), w117.end(), std::ref(f16rng));
+  std::generate(w118.begin(), w118.end(), std::ref(f16rng));
+  std::generate(w119.begin(), w119.end(), std::ref(f16rng));
+  std::generate(w120.begin(), w120.end(), std::ref(f16rng));
+  std::generate(w121.begin(), w121.end(), std::ref(f16rng));
+  std::generate(w122.begin(), w122.end(), std::ref(f16rng));
+  std::generate(w123.begin(), w123.end(), std::ref(f16rng));
+  std::generate(w124.begin(), w124.end(), std::ref(f16rng));
+  std::generate(w125.begin(), w125.end(), std::ref(f16rng));
+  std::generate(w126.begin(), w126.end(), std::ref(f16rng));
+  std::generate(w127.begin(), w127.end(), std::ref(f16rng));
+  std::generate(w128.begin(), w128.end(), std::ref(f16rng));
+  std::generate(w129.begin(), w129.end(), std::ref(f16rng));
+  std::generate(w130.begin(), w130.end(), std::ref(f16rng));
+  std::generate(w131.begin(), w131.end(), std::ref(f16rng));
+  std::generate(w132.begin(), w132.end(), std::ref(f16rng));
+  std::generate(w133.begin(), w133.end(), std::ref(f16rng));
+  std::generate(w134.begin(), w134.end(), std::ref(f16rng));
+  std::generate(w135.begin(), w135.end(), std::ref(f16rng));
+  std::generate(w136.begin(), w136.end(), std::ref(f16rng));
+  std::generate(w137.begin(), w137.end(), std::ref(f16rng));
+  std::generate(w138.begin(), w138.end(), std::ref(f16rng));
+  std::generate(w139.begin(), w139.end(), std::ref(f16rng));
+  std::generate(w140.begin(), w140.end(), std::ref(f16rng));
+  std::generate(w141.begin(), w141.end(), std::ref(f16rng));
+  std::generate(w142.begin(), w142.end(), std::ref(f16rng));
+  std::generate(w143.begin(), w143.end(), std::ref(f16rng));
+  std::generate(w144.begin(), w144.end(), std::ref(f16rng));
+  std::generate(w145.begin(), w145.end(), std::ref(f16rng));
+  std::generate(w146.begin(), w146.end(), std::ref(f16rng));
+  std::generate(w147.begin(), w147.end(), std::ref(f16rng));
+  std::generate(w148.begin(), w148.end(), std::ref(f16rng));
+  std::generate(w149.begin(), w149.end(), std::ref(f16rng));
+  std::generate(w150.begin(), w150.end(), std::ref(f16rng));
+  std::generate(w151.begin(), w151.end(), std::ref(f16rng));
+  std::generate(w152.begin(), w152.end(), std::ref(f16rng));
+  std::generate(w153.begin(), w153.end(), std::ref(f16rng));
+  std::generate(w154.begin(), w154.end(), std::ref(f16rng));
+  std::generate(w155.begin(), w155.end(), std::ref(f16rng));
+  std::generate(w156.begin(), w156.end(), std::ref(f16rng));
+  std::generate(w157.begin(), w157.end(), std::ref(f16rng));
+  std::generate(w158.begin(), w158.end(), std::ref(f16rng));
+  std::generate(w159.begin(), w159.end(), std::ref(f16rng));
+  std::generate(w160.begin(), w160.end(), std::ref(f16rng));
+  std::generate(w161.begin(), w161.end(), std::ref(f16rng));
+  std::generate(w162.begin(), w162.end(), std::ref(f16rng));
+  std::generate(w163.begin(), w163.end(), std::ref(f16rng));
+  std::generate(w164.begin(), w164.end(), std::ref(f16rng));
+  std::generate(w165.begin(), w165.end(), std::ref(f16rng));
+  std::generate(w166.begin(), w166.end(), std::ref(f16rng));
+  std::generate(w167.begin(), w167.end(), std::ref(f16rng));
+  std::generate(w168.begin(), w168.end(), std::ref(f16rng));
+  std::generate(w169.begin(), w169.end(), std::ref(f16rng));
+  std::generate(w170.begin(), w170.end(), std::ref(f16rng));
 
   ExecutionPlan operators;
   xnn_status status;
@@ -381,7 +382,7 @@
     32 /* output_channels_per_group */,
     3 /* input pixel stride */,
     32 /* output pixel stride */,
-    w65, w66,
+    w65.data(), w66.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op0);
@@ -403,7 +404,7 @@
     1 /* output_channels_per_group */,
     32 /* input pixel stride */,
     32 /* output pixel stride */,
-    w67, w68,
+    w67.data(), w68.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op1);
@@ -425,7 +426,7 @@
     16 /* output_channels_per_group */,
     32 /* input pixel stride */,
     16 /* output pixel stride */,
-    w69, w70,
+    w69.data(), w70.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op2);
@@ -447,7 +448,7 @@
     96 /* output_channels_per_group */,
     16 /* input pixel stride */,
     96 /* output pixel stride */,
-    w71, w72,
+    w71.data(), w72.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op3);
@@ -469,7 +470,7 @@
     1 /* output_channels_per_group */,
     96 /* input pixel stride */,
     96 /* output pixel stride */,
-    w73, w74,
+    w73.data(), w74.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op4);
@@ -491,7 +492,7 @@
     24 /* output_channels_per_group */,
     96 /* input pixel stride */,
     24 /* output pixel stride */,
-    w75, w76,
+    w75.data(), w76.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op5);
@@ -513,7 +514,7 @@
     144 /* output_channels_per_group */,
     24 /* input pixel stride */,
     144 /* output pixel stride */,
-    w77, w78,
+    w77.data(), w78.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op6);
@@ -535,7 +536,7 @@
     1 /* output_channels_per_group */,
     144 /* input pixel stride */,
     144 /* output pixel stride */,
-    w79, w80,
+    w79.data(), w80.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op7);
@@ -557,7 +558,7 @@
     24 /* output_channels_per_group */,
     144 /* input pixel stride */,
     24 /* output pixel stride */,
-    w81, w82,
+    w81.data(), w82.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op8);
@@ -590,7 +591,7 @@
     144 /* output_channels_per_group */,
     24 /* input pixel stride */,
     144 /* output pixel stride */,
-    w83, w84,
+    w83.data(), w84.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op10);
@@ -612,7 +613,7 @@
     1 /* output_channels_per_group */,
     144 /* input pixel stride */,
     144 /* output pixel stride */,
-    w85, w86,
+    w85.data(), w86.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op11);
@@ -634,7 +635,7 @@
     32 /* output_channels_per_group */,
     144 /* input pixel stride */,
     32 /* output pixel stride */,
-    w87, w88,
+    w87.data(), w88.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op12);
@@ -656,7 +657,7 @@
     192 /* output_channels_per_group */,
     32 /* input pixel stride */,
     192 /* output pixel stride */,
-    w89, w90,
+    w89.data(), w90.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op13);
@@ -678,7 +679,7 @@
     1 /* output_channels_per_group */,
     192 /* input pixel stride */,
     192 /* output pixel stride */,
-    w91, w92,
+    w91.data(), w92.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op14);
@@ -700,7 +701,7 @@
     32 /* output_channels_per_group */,
     192 /* input pixel stride */,
     32 /* output pixel stride */,
-    w93, w94,
+    w93.data(), w94.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op15);
@@ -733,7 +734,7 @@
     192 /* output_channels_per_group */,
     32 /* input pixel stride */,
     192 /* output pixel stride */,
-    w95, w96,
+    w95.data(), w96.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op17);
@@ -755,7 +756,7 @@
     1 /* output_channels_per_group */,
     192 /* input pixel stride */,
     192 /* output pixel stride */,
-    w97, w98,
+    w97.data(), w98.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op18);
@@ -777,7 +778,7 @@
     32 /* output_channels_per_group */,
     192 /* input pixel stride */,
     32 /* output pixel stride */,
-    w99, w100,
+    w99.data(), w100.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op19);
@@ -810,7 +811,7 @@
     192 /* output_channels_per_group */,
     32 /* input pixel stride */,
     192 /* output pixel stride */,
-    w101, w102,
+    w101.data(), w102.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op21);
@@ -832,7 +833,7 @@
     1 /* output_channels_per_group */,
     192 /* input pixel stride */,
     192 /* output pixel stride */,
-    w103, w104,
+    w103.data(), w104.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op22);
@@ -854,7 +855,7 @@
     64 /* output_channels_per_group */,
     192 /* input pixel stride */,
     64 /* output pixel stride */,
-    w105, w106,
+    w105.data(), w106.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op23);
@@ -876,7 +877,7 @@
     384 /* output_channels_per_group */,
     64 /* input pixel stride */,
     384 /* output pixel stride */,
-    w107, w108,
+    w107.data(), w108.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op24);
@@ -898,7 +899,7 @@
     1 /* output_channels_per_group */,
     384 /* input pixel stride */,
     384 /* output pixel stride */,
-    w109, w110,
+    w109.data(), w110.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op25);
@@ -920,7 +921,7 @@
     64 /* output_channels_per_group */,
     384 /* input pixel stride */,
     64 /* output pixel stride */,
-    w111, w112,
+    w111.data(), w112.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op26);
@@ -953,7 +954,7 @@
     384 /* output_channels_per_group */,
     64 /* input pixel stride */,
     384 /* output pixel stride */,
-    w113, w114,
+    w113.data(), w114.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op28);
@@ -975,7 +976,7 @@
     1 /* output_channels_per_group */,
     384 /* input pixel stride */,
     384 /* output pixel stride */,
-    w115, w116,
+    w115.data(), w116.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op29);
@@ -997,7 +998,7 @@
     64 /* output_channels_per_group */,
     384 /* input pixel stride */,
     64 /* output pixel stride */,
-    w117, w118,
+    w117.data(), w118.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op30);
@@ -1030,7 +1031,7 @@
     384 /* output_channels_per_group */,
     64 /* input pixel stride */,
     384 /* output pixel stride */,
-    w119, w120,
+    w119.data(), w120.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op32);
@@ -1052,7 +1053,7 @@
     1 /* output_channels_per_group */,
     384 /* input pixel stride */,
     384 /* output pixel stride */,
-    w121, w122,
+    w121.data(), w122.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op33);
@@ -1074,7 +1075,7 @@
     64 /* output_channels_per_group */,
     384 /* input pixel stride */,
     64 /* output pixel stride */,
-    w123, w124,
+    w123.data(), w124.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op34);
@@ -1107,7 +1108,7 @@
     384 /* output_channels_per_group */,
     64 /* input pixel stride */,
     384 /* output pixel stride */,
-    w125, w126,
+    w125.data(), w126.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op36);
@@ -1129,7 +1130,7 @@
     1 /* output_channels_per_group */,
     384 /* input pixel stride */,
     384 /* output pixel stride */,
-    w127, w128,
+    w127.data(), w128.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op37);
@@ -1151,7 +1152,7 @@
     96 /* output_channels_per_group */,
     384 /* input pixel stride */,
     96 /* output pixel stride */,
-    w129, w130,
+    w129.data(), w130.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op38);
@@ -1173,7 +1174,7 @@
     576 /* output_channels_per_group */,
     96 /* input pixel stride */,
     576 /* output pixel stride */,
-    w131, w132,
+    w131.data(), w132.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op39);
@@ -1195,7 +1196,7 @@
     1 /* output_channels_per_group */,
     576 /* input pixel stride */,
     576 /* output pixel stride */,
-    w133, w134,
+    w133.data(), w134.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op40);
@@ -1217,7 +1218,7 @@
     96 /* output_channels_per_group */,
     576 /* input pixel stride */,
     96 /* output pixel stride */,
-    w135, w136,
+    w135.data(), w136.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op41);
@@ -1250,7 +1251,7 @@
     576 /* output_channels_per_group */,
     96 /* input pixel stride */,
     576 /* output pixel stride */,
-    w137, w138,
+    w137.data(), w138.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op43);
@@ -1272,7 +1273,7 @@
     1 /* output_channels_per_group */,
     576 /* input pixel stride */,
     576 /* output pixel stride */,
-    w139, w140,
+    w139.data(), w140.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op44);
@@ -1294,7 +1295,7 @@
     96 /* output_channels_per_group */,
     576 /* input pixel stride */,
     96 /* output pixel stride */,
-    w141, w142,
+    w141.data(), w142.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op45);
@@ -1327,7 +1328,7 @@
     576 /* output_channels_per_group */,
     96 /* input pixel stride */,
     576 /* output pixel stride */,
-    w143, w144,
+    w143.data(), w144.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op47);
@@ -1349,7 +1350,7 @@
     1 /* output_channels_per_group */,
     576 /* input pixel stride */,
     576 /* output pixel stride */,
-    w145, w146,
+    w145.data(), w146.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op48);
@@ -1371,7 +1372,7 @@
     160 /* output_channels_per_group */,
     576 /* input pixel stride */,
     160 /* output pixel stride */,
-    w147, w148,
+    w147.data(), w148.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op49);
@@ -1393,7 +1394,7 @@
     960 /* output_channels_per_group */,
     160 /* input pixel stride */,
     960 /* output pixel stride */,
-    w149, w150,
+    w149.data(), w150.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op50);
@@ -1415,7 +1416,7 @@
     1 /* output_channels_per_group */,
     960 /* input pixel stride */,
     960 /* output pixel stride */,
-    w151, w152,
+    w151.data(), w152.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op51);
@@ -1437,7 +1438,7 @@
     160 /* output_channels_per_group */,
     960 /* input pixel stride */,
     160 /* output pixel stride */,
-    w153, w154,
+    w153.data(), w154.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op52);
@@ -1470,7 +1471,7 @@
     960 /* output_channels_per_group */,
     160 /* input pixel stride */,
     960 /* output pixel stride */,
-    w155, w156,
+    w155.data(), w156.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op54);
@@ -1492,7 +1493,7 @@
     1 /* output_channels_per_group */,
     960 /* input pixel stride */,
     960 /* output pixel stride */,
-    w157, w158,
+    w157.data(), w158.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op55);
@@ -1514,7 +1515,7 @@
     160 /* output_channels_per_group */,
     960 /* input pixel stride */,
     160 /* output pixel stride */,
-    w159, w160,
+    w159.data(), w160.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op56);
@@ -1547,7 +1548,7 @@
     960 /* output_channels_per_group */,
     160 /* input pixel stride */,
     960 /* output pixel stride */,
-    w161, w162,
+    w161.data(), w162.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op58);
@@ -1569,7 +1570,7 @@
     1 /* output_channels_per_group */,
     960 /* input pixel stride */,
     960 /* output pixel stride */,
-    w163, w164,
+    w163.data(), w164.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op59);
@@ -1591,7 +1592,7 @@
     320 /* output_channels_per_group */,
     960 /* input pixel stride */,
     320 /* output pixel stride */,
-    w165, w166,
+    w165.data(), w166.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op60);
@@ -1613,7 +1614,7 @@
     1280 /* output_channels_per_group */,
     320 /* input pixel stride */,
     1280 /* output pixel stride */,
-    w167, w168,
+    w167.data(), w168.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op61);
@@ -1647,7 +1648,7 @@
     1001 /* output_channels_per_group */,
     1280 /* input pixel stride */,
     1001 /* output pixel stride */,
-    w169, w170,
+    w169.data(), w170.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op63);
@@ -1662,7 +1663,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op0,
     1 /* batch size */, 224 /* input height */, 224 /* input width */,
-    v0 /* input */, v1 /* output */,
+    v0.data() /* input */, v1.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #0" << std::endl;
@@ -1672,7 +1673,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op1,
     1 /* batch size */, 112 /* input height */, 112 /* input width */,
-    v1 /* input */, v2 /* output */,
+    v1.data() /* input */, v2.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #1" << std::endl;
@@ -1682,7 +1683,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op2,
     1 /* batch size */, 112 /* input height */, 112 /* input width */,
-    v2 /* input */, v3 /* output */,
+    v2.data() /* input */, v3.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #2" << std::endl;
@@ -1692,7 +1693,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op3,
     1 /* batch size */, 112 /* input height */, 112 /* input width */,
-    v3 /* input */, v4 /* output */,
+    v3.data() /* input */, v4.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #3" << std::endl;
@@ -1702,7 +1703,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op4,
     1 /* batch size */, 112 /* input height */, 112 /* input width */,
-    v4 /* input */, v5 /* output */,
+    v4.data() /* input */, v5.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #4" << std::endl;
@@ -1712,7 +1713,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op5,
     1 /* batch size */, 56 /* input height */, 56 /* input width */,
-    v5 /* input */, v6 /* output */,
+    v5.data() /* input */, v6.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #5" << std::endl;
@@ -1722,7 +1723,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op6,
     1 /* batch size */, 56 /* input height */, 56 /* input width */,
-    v6 /* input */, v7 /* output */,
+    v6.data() /* input */, v7.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #6" << std::endl;
@@ -1732,7 +1733,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op7,
     1 /* batch size */, 56 /* input height */, 56 /* input width */,
-    v7 /* input */, v8 /* output */,
+    v7.data() /* input */, v8.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #7" << std::endl;
@@ -1742,7 +1743,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op8,
     1 /* batch size */, 56 /* input height */, 56 /* input width */,
-    v8 /* input */, v9 /* output */,
+    v8.data() /* input */, v9.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #8" << std::endl;
@@ -1755,7 +1756,7 @@
     status = xnn_setup_add_nd_f16(
       op9,
       4, a_shape, 4, b_shape,
-      v9 /* a */, v6 /* b */, v10 /* output */,
+      v9.data() /* a */, v6.data() /* b */, v10.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -1766,7 +1767,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op10,
     1 /* batch size */, 56 /* input height */, 56 /* input width */,
-    v10 /* input */, v11 /* output */,
+    v10.data() /* input */, v11.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #10" << std::endl;
@@ -1776,7 +1777,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op11,
     1 /* batch size */, 56 /* input height */, 56 /* input width */,
-    v11 /* input */, v12 /* output */,
+    v11.data() /* input */, v12.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #11" << std::endl;
@@ -1786,7 +1787,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op12,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v12 /* input */, v13 /* output */,
+    v12.data() /* input */, v13.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #12" << std::endl;
@@ -1796,7 +1797,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op13,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v13 /* input */, v14 /* output */,
+    v13.data() /* input */, v14.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #13" << std::endl;
@@ -1806,7 +1807,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op14,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v14 /* input */, v15 /* output */,
+    v14.data() /* input */, v15.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #14" << std::endl;
@@ -1816,7 +1817,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op15,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v15 /* input */, v16 /* output */,
+    v15.data() /* input */, v16.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #15" << std::endl;
@@ -1829,7 +1830,7 @@
     status = xnn_setup_add_nd_f16(
       op16,
       4, a_shape, 4, b_shape,
-      v16 /* a */, v13 /* b */, v17 /* output */,
+      v16.data() /* a */, v13.data() /* b */, v17.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -1840,7 +1841,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op17,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v17 /* input */, v18 /* output */,
+    v17.data() /* input */, v18.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #17" << std::endl;
@@ -1850,7 +1851,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op18,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v18 /* input */, v19 /* output */,
+    v18.data() /* input */, v19.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #18" << std::endl;
@@ -1860,7 +1861,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op19,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v19 /* input */, v20 /* output */,
+    v19.data() /* input */, v20.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #19" << std::endl;
@@ -1873,7 +1874,7 @@
     status = xnn_setup_add_nd_f16(
       op20,
       4, a_shape, 4, b_shape,
-      v20 /* a */, v17 /* b */, v21 /* output */,
+      v20.data() /* a */, v17.data() /* b */, v21.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -1884,7 +1885,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op21,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v21 /* input */, v22 /* output */,
+    v21.data() /* input */, v22.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #21" << std::endl;
@@ -1894,7 +1895,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op22,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v22 /* input */, v23 /* output */,
+    v22.data() /* input */, v23.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #22" << std::endl;
@@ -1904,7 +1905,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op23,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v23 /* input */, v24 /* output */,
+    v23.data() /* input */, v24.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #23" << std::endl;
@@ -1914,7 +1915,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op24,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v24 /* input */, v25 /* output */,
+    v24.data() /* input */, v25.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #24" << std::endl;
@@ -1924,7 +1925,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op25,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v25 /* input */, v26 /* output */,
+    v25.data() /* input */, v26.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #25" << std::endl;
@@ -1934,7 +1935,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op26,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v26 /* input */, v27 /* output */,
+    v26.data() /* input */, v27.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #26" << std::endl;
@@ -1947,7 +1948,7 @@
     status = xnn_setup_add_nd_f16(
       op27,
       4, a_shape, 4, b_shape,
-      v27 /* a */, v24 /* b */, v28 /* output */,
+      v27.data() /* a */, v24.data() /* b */, v28.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -1958,7 +1959,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op28,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v28 /* input */, v29 /* output */,
+    v28.data() /* input */, v29.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #28" << std::endl;
@@ -1968,7 +1969,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op29,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v29 /* input */, v30 /* output */,
+    v29.data() /* input */, v30.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #29" << std::endl;
@@ -1978,7 +1979,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op30,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v30 /* input */, v31 /* output */,
+    v30.data() /* input */, v31.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #30" << std::endl;
@@ -1991,7 +1992,7 @@
     status = xnn_setup_add_nd_f16(
       op31,
       4, a_shape, 4, b_shape,
-      v31 /* a */, v28 /* b */, v32 /* output */,
+      v31.data() /* a */, v28.data() /* b */, v32.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -2002,7 +2003,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op32,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v32 /* input */, v33 /* output */,
+    v32.data() /* input */, v33.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #32" << std::endl;
@@ -2012,7 +2013,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op33,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v33 /* input */, v34 /* output */,
+    v33.data() /* input */, v34.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #33" << std::endl;
@@ -2022,7 +2023,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op34,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v34 /* input */, v35 /* output */,
+    v34.data() /* input */, v35.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #34" << std::endl;
@@ -2035,7 +2036,7 @@
     status = xnn_setup_add_nd_f16(
       op35,
       4, a_shape, 4, b_shape,
-      v35 /* a */, v32 /* b */, v36 /* output */,
+      v35.data() /* a */, v32.data() /* b */, v36.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -2046,7 +2047,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op36,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v36 /* input */, v37 /* output */,
+    v36.data() /* input */, v37.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #36" << std::endl;
@@ -2056,7 +2057,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op37,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v37 /* input */, v38 /* output */,
+    v37.data() /* input */, v38.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #37" << std::endl;
@@ -2066,7 +2067,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op38,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v38 /* input */, v39 /* output */,
+    v38.data() /* input */, v39.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #38" << std::endl;
@@ -2076,7 +2077,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op39,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v39 /* input */, v40 /* output */,
+    v39.data() /* input */, v40.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #39" << std::endl;
@@ -2086,7 +2087,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op40,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v40 /* input */, v41 /* output */,
+    v40.data() /* input */, v41.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #40" << std::endl;
@@ -2096,7 +2097,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op41,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v41 /* input */, v42 /* output */,
+    v41.data() /* input */, v42.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #41" << std::endl;
@@ -2109,7 +2110,7 @@
     status = xnn_setup_add_nd_f16(
       op42,
       4, a_shape, 4, b_shape,
-      v42 /* a */, v39 /* b */, v43 /* output */,
+      v42.data() /* a */, v39.data() /* b */, v43.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -2120,7 +2121,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op43,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v43 /* input */, v44 /* output */,
+    v43.data() /* input */, v44.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #43" << std::endl;
@@ -2130,7 +2131,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op44,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v44 /* input */, v45 /* output */,
+    v44.data() /* input */, v45.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #44" << std::endl;
@@ -2140,7 +2141,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op45,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v45 /* input */, v46 /* output */,
+    v45.data() /* input */, v46.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #45" << std::endl;
@@ -2153,7 +2154,7 @@
     status = xnn_setup_add_nd_f16(
       op46,
       4, a_shape, 4, b_shape,
-      v46 /* a */, v43 /* b */, v47 /* output */,
+      v46.data() /* a */, v43.data() /* b */, v47.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -2164,7 +2165,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op47,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v47 /* input */, v48 /* output */,
+    v47.data() /* input */, v48.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #47" << std::endl;
@@ -2174,7 +2175,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op48,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v48 /* input */, v49 /* output */,
+    v48.data() /* input */, v49.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #48" << std::endl;
@@ -2184,7 +2185,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op49,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v49 /* input */, v50 /* output */,
+    v49.data() /* input */, v50.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #49" << std::endl;
@@ -2194,7 +2195,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op50,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v50 /* input */, v51 /* output */,
+    v50.data() /* input */, v51.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #50" << std::endl;
@@ -2204,7 +2205,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op51,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v51 /* input */, v52 /* output */,
+    v51.data() /* input */, v52.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #51" << std::endl;
@@ -2214,7 +2215,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op52,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v52 /* input */, v53 /* output */,
+    v52.data() /* input */, v53.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #52" << std::endl;
@@ -2227,7 +2228,7 @@
     status = xnn_setup_add_nd_f16(
       op53,
       4, a_shape, 4, b_shape,
-      v53 /* a */, v50 /* b */, v54 /* output */,
+      v53.data() /* a */, v50.data() /* b */, v54.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -2238,7 +2239,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op54,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v54 /* input */, v55 /* output */,
+    v54.data() /* input */, v55.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #54" << std::endl;
@@ -2248,7 +2249,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op55,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v55 /* input */, v56 /* output */,
+    v55.data() /* input */, v56.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #55" << std::endl;
@@ -2258,7 +2259,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op56,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v56 /* input */, v57 /* output */,
+    v56.data() /* input */, v57.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #56" << std::endl;
@@ -2271,7 +2272,7 @@
     status = xnn_setup_add_nd_f16(
       op57,
       4, a_shape, 4, b_shape,
-      v57 /* a */, v54 /* b */, v58 /* output */,
+      v57.data() /* a */, v54.data() /* b */, v58.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -2282,7 +2283,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op58,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v58 /* input */, v59 /* output */,
+    v58.data() /* input */, v59.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #58" << std::endl;
@@ -2292,7 +2293,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op59,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v59 /* input */, v60 /* output */,
+    v59.data() /* input */, v60.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #59" << std::endl;
@@ -2302,7 +2303,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op60,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v60 /* input */, v61 /* output */,
+    v60.data() /* input */, v61.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #60" << std::endl;
@@ -2312,7 +2313,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op61,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v61 /* input */, v62 /* output */,
+    v61.data() /* input */, v62.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #61" << std::endl;
@@ -2322,7 +2323,7 @@
   status = xnn_setup_global_average_pooling_nwc_f16(
     op62,
     1 /* batch size */, 49 /* width */,
-    v62 /* input */, v63 /* output */,
+    v62.data() /* input */, v63.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #62" << std::endl;
@@ -2332,7 +2333,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op63,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v63 /* input */, v64 /* output */,
+    v63.data() /* input */, v64.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #63" << std::endl;
diff --git a/models/fp16-mobilenet-v3-large.cc b/models/fp16-mobilenet-v3-large.cc
index 37892f9..7562931 100644
--- a/models/fp16-mobilenet-v3-large.cc
+++ b/models/fp16-mobilenet-v3-large.cc
@@ -5,6 +5,7 @@
 
 #include <xnnpack.h>
 
+#include <array>
 #include <algorithm>
 #include <functional>
 #include <iostream>
@@ -18,495 +19,495 @@
 namespace models {
 
 ExecutionPlan FP16MobileNetV3Large(pthreadpool_t threadpool) {
-  alignas(16) static uint16_t v0[150528];
-  alignas(16) static uint16_t v1[200704];
-  alignas(16) static uint16_t v2[200704];
-  alignas(16) static uint16_t v3[200704];
-  alignas(16) static uint16_t v4[200704];
-  alignas(16) static uint16_t v5[200704];
-  alignas(16) static uint16_t v6[802816];
-  alignas(16) static uint16_t v7[200704];
-  alignas(16) static uint16_t v8[75264];
-  alignas(16) static uint16_t v9[225792];
-  alignas(16) static uint16_t v10[225792];
-  alignas(16) static uint16_t v11[75264];
-  alignas(16) static uint16_t v12[75264];
-  alignas(16) static uint16_t v13[225792];
-  alignas(16) static uint16_t v14[56448];
-  alignas(16) static uint16_t v15[72];
-  alignas(16) static uint16_t v16[24];
-  alignas(16) static uint16_t v17[72];
-  alignas(16) static uint16_t v18[56448];
-  alignas(16) static uint16_t v19[31360];
-  alignas(16) static uint16_t v20[94080];
-  alignas(16) static uint16_t v21[94080];
-  alignas(16) static uint16_t v22[120];
-  alignas(16) static uint16_t v23[32];
-  alignas(16) static uint16_t v24[120];
-  alignas(16) static uint16_t v25[94080];
-  alignas(16) static uint16_t v26[31360];
-  alignas(16) static uint16_t v27[31360];
-  alignas(16) static uint16_t v28[94080];
-  alignas(16) static uint16_t v29[94080];
-  alignas(16) static uint16_t v30[120];
-  alignas(16) static uint16_t v31[32];
-  alignas(16) static uint16_t v32[120];
-  alignas(16) static uint16_t v33[94080];
-  alignas(16) static uint16_t v34[31360];
-  alignas(16) static uint16_t v35[31360];
-  alignas(16) static uint16_t v36[188160];
-  alignas(16) static uint16_t v37[188160];
-  alignas(16) static uint16_t v38[47040];
-  alignas(16) static uint16_t v39[47040];
-  alignas(16) static uint16_t v40[15680];
-  alignas(16) static uint16_t v41[39200];
-  alignas(16) static uint16_t v42[39200];
-  alignas(16) static uint16_t v43[39200];
-  alignas(16) static uint16_t v44[39200];
-  alignas(16) static uint16_t v45[15680];
-  alignas(16) static uint16_t v46[15680];
-  alignas(16) static uint16_t v47[36064];
-  alignas(16) static uint16_t v48[36064];
-  alignas(16) static uint16_t v49[36064];
-  alignas(16) static uint16_t v50[36064];
-  alignas(16) static uint16_t v51[15680];
-  alignas(16) static uint16_t v52[15680];
-  alignas(16) static uint16_t v53[36064];
-  alignas(16) static uint16_t v54[36064];
-  alignas(16) static uint16_t v55[36064];
-  alignas(16) static uint16_t v56[36064];
-  alignas(16) static uint16_t v57[15680];
-  alignas(16) static uint16_t v58[15680];
-  alignas(16) static uint16_t v59[94080];
-  alignas(16) static uint16_t v60[94080];
-  alignas(16) static uint16_t v61[94080];
-  alignas(16) static uint16_t v62[94080];
-  alignas(16) static uint16_t v63[480];
-  alignas(16) static uint16_t v64[120];
-  alignas(16) static uint16_t v65[480];
-  alignas(16) static uint16_t v66[94080];
-  alignas(16) static uint16_t v67[21952];
-  alignas(16) static uint16_t v68[131712];
-  alignas(16) static uint16_t v69[131712];
-  alignas(16) static uint16_t v70[131712];
-  alignas(16) static uint16_t v71[131712];
-  alignas(16) static uint16_t v72[672];
-  alignas(16) static uint16_t v73[168];
-  alignas(16) static uint16_t v74[672];
-  alignas(16) static uint16_t v75[131712];
-  alignas(16) static uint16_t v76[21952];
-  alignas(16) static uint16_t v77[21952];
-  alignas(16) static uint16_t v78[131712];
-  alignas(16) static uint16_t v79[131712];
-  alignas(16) static uint16_t v80[32928];
-  alignas(16) static uint16_t v81[32928];
-  alignas(16) static uint16_t v82[672];
-  alignas(16) static uint16_t v83[168];
-  alignas(16) static uint16_t v84[672];
-  alignas(16) static uint16_t v85[32928];
-  alignas(16) static uint16_t v86[7840];
-  alignas(16) static uint16_t v87[47040];
-  alignas(16) static uint16_t v88[47040];
-  alignas(16) static uint16_t v89[47040];
-  alignas(16) static uint16_t v90[47040];
-  alignas(16) static uint16_t v91[960];
-  alignas(16) static uint16_t v92[240];
-  alignas(16) static uint16_t v93[960];
-  alignas(16) static uint16_t v94[47040];
-  alignas(16) static uint16_t v95[7840];
-  alignas(16) static uint16_t v96[7840];
-  alignas(16) static uint16_t v97[47040];
-  alignas(16) static uint16_t v98[47040];
-  alignas(16) static uint16_t v99[47040];
-  alignas(16) static uint16_t v100[47040];
-  alignas(16) static uint16_t v101[960];
-  alignas(16) static uint16_t v102[240];
-  alignas(16) static uint16_t v103[960];
-  alignas(16) static uint16_t v104[47040];
-  alignas(16) static uint16_t v105[7840];
-  alignas(16) static uint16_t v106[7840];
-  alignas(16) static uint16_t v107[47040];
-  alignas(16) static uint16_t v108[47040];
-  alignas(16) static uint16_t v109[960];
-  alignas(16) static uint16_t v110[1280];
-  alignas(16) static uint16_t v111[1280];
-  alignas(16) static uint16_t v112[1280];
-  alignas(16) static uint16_t v113[1001];
-  alignas(16) static uint16_t w114[432];
-  alignas(16) static uint16_t w115[16];
-  alignas(16) static uint16_t w116[144];
-  alignas(16) static uint16_t w117[16];
-  alignas(16) static uint16_t w118[256];
-  alignas(16) static uint16_t w119[16];
-  alignas(16) static uint16_t w120[1024];
-  alignas(16) static uint16_t w121[64];
-  alignas(16) static uint16_t w122[576];
-  alignas(16) static uint16_t w123[64];
-  alignas(16) static uint16_t w124[1536];
-  alignas(16) static uint16_t w125[24];
-  alignas(16) static uint16_t w126[1728];
-  alignas(16) static uint16_t w127[72];
-  alignas(16) static uint16_t w128[648];
-  alignas(16) static uint16_t w129[72];
-  alignas(16) static uint16_t w130[1728];
-  alignas(16) static uint16_t w131[24];
-  alignas(16) static uint16_t w132[1728];
-  alignas(16) static uint16_t w133[72];
-  alignas(16) static uint16_t w134[1800];
-  alignas(16) static uint16_t w135[72];
-  alignas(16) static uint16_t w136[1728];
-  alignas(16) static uint16_t w137[24];
-  alignas(16) static uint16_t w138[1728];
-  alignas(16) static uint16_t w139[72];
-  alignas(16) static uint16_t w140[2880];
-  alignas(16) static uint16_t w141[40];
-  alignas(16) static uint16_t w142[4800];
-  alignas(16) static uint16_t w143[120];
-  alignas(16) static uint16_t w144[3000];
-  alignas(16) static uint16_t w145[120];
-  alignas(16) static uint16_t w146[3840];
-  alignas(16) static uint16_t w147[32];
-  alignas(16) static uint16_t w148[3840];
-  alignas(16) static uint16_t w149[120];
-  alignas(16) static uint16_t w150[4800];
-  alignas(16) static uint16_t w151[40];
-  alignas(16) static uint16_t w152[4800];
-  alignas(16) static uint16_t w153[120];
-  alignas(16) static uint16_t w154[3000];
-  alignas(16) static uint16_t w155[120];
-  alignas(16) static uint16_t w156[3840];
-  alignas(16) static uint16_t w157[32];
-  alignas(16) static uint16_t w158[3840];
-  alignas(16) static uint16_t w159[120];
-  alignas(16) static uint16_t w160[4800];
-  alignas(16) static uint16_t w161[40];
-  alignas(16) static uint16_t w162[9600];
-  alignas(16) static uint16_t w163[240];
-  alignas(16) static uint16_t w164[2160];
-  alignas(16) static uint16_t w165[240];
-  alignas(16) static uint16_t w166[19200];
-  alignas(16) static uint16_t w167[80];
-  alignas(16) static uint16_t w168[16000];
-  alignas(16) static uint16_t w169[200];
-  alignas(16) static uint16_t w170[1800];
-  alignas(16) static uint16_t w171[200];
-  alignas(16) static uint16_t w172[16000];
-  alignas(16) static uint16_t w173[80];
-  alignas(16) static uint16_t w174[14720];
-  alignas(16) static uint16_t w175[184];
-  alignas(16) static uint16_t w176[1656];
-  alignas(16) static uint16_t w177[184];
-  alignas(16) static uint16_t w178[14720];
-  alignas(16) static uint16_t w179[80];
-  alignas(16) static uint16_t w180[14720];
-  alignas(16) static uint16_t w181[184];
-  alignas(16) static uint16_t w182[1656];
-  alignas(16) static uint16_t w183[184];
-  alignas(16) static uint16_t w184[14720];
-  alignas(16) static uint16_t w185[80];
-  alignas(16) static uint16_t w186[38400];
-  alignas(16) static uint16_t w187[480];
-  alignas(16) static uint16_t w188[4320];
-  alignas(16) static uint16_t w189[480];
-  alignas(16) static uint16_t w190[57600];
-  alignas(16) static uint16_t w191[120];
-  alignas(16) static uint16_t w192[57600];
-  alignas(16) static uint16_t w193[480];
-  alignas(16) static uint16_t w194[53760];
-  alignas(16) static uint16_t w195[112];
-  alignas(16) static uint16_t w196[75264];
-  alignas(16) static uint16_t w197[672];
-  alignas(16) static uint16_t w198[6048];
-  alignas(16) static uint16_t w199[672];
-  alignas(16) static uint16_t w200[112896];
-  alignas(16) static uint16_t w201[168];
-  alignas(16) static uint16_t w202[112896];
-  alignas(16) static uint16_t w203[672];
-  alignas(16) static uint16_t w204[75264];
-  alignas(16) static uint16_t w205[112];
-  alignas(16) static uint16_t w206[75264];
-  alignas(16) static uint16_t w207[672];
-  alignas(16) static uint16_t w208[16800];
-  alignas(16) static uint16_t w209[672];
-  alignas(16) static uint16_t w210[112896];
-  alignas(16) static uint16_t w211[168];
-  alignas(16) static uint16_t w212[112896];
-  alignas(16) static uint16_t w213[672];
-  alignas(16) static uint16_t w214[107520];
-  alignas(16) static uint16_t w215[160];
-  alignas(16) static uint16_t w216[153600];
-  alignas(16) static uint16_t w217[960];
-  alignas(16) static uint16_t w218[24000];
-  alignas(16) static uint16_t w219[960];
-  alignas(16) static uint16_t w220[230400];
-  alignas(16) static uint16_t w221[240];
-  alignas(16) static uint16_t w222[230400];
-  alignas(16) static uint16_t w223[960];
-  alignas(16) static uint16_t w224[153600];
-  alignas(16) static uint16_t w225[160];
-  alignas(16) static uint16_t w226[153600];
-  alignas(16) static uint16_t w227[960];
-  alignas(16) static uint16_t w228[24000];
-  alignas(16) static uint16_t w229[960];
-  alignas(16) static uint16_t w230[230400];
-  alignas(16) static uint16_t w231[240];
-  alignas(16) static uint16_t w232[230400];
-  alignas(16) static uint16_t w233[960];
-  alignas(16) static uint16_t w234[153600];
-  alignas(16) static uint16_t w235[160];
-  alignas(16) static uint16_t w236[153600];
-  alignas(16) static uint16_t w237[960];
-  alignas(16) static uint16_t w238[1228800];
-  alignas(16) static uint16_t w239[1280];
-  alignas(16) static uint16_t w240[1281280];
-  alignas(16) static uint16_t w241[1001];
+  alignas(16) static std::array<uint16_t, 150528> v0;
+  alignas(16) static std::array<uint16_t, 200704> v1;
+  alignas(16) static std::array<uint16_t, 200704> v2;
+  alignas(16) static std::array<uint16_t, 200704> v3;
+  alignas(16) static std::array<uint16_t, 200704> v4;
+  alignas(16) static std::array<uint16_t, 200704> v5;
+  alignas(16) static std::array<uint16_t, 802816> v6;
+  alignas(16) static std::array<uint16_t, 200704> v7;
+  alignas(16) static std::array<uint16_t, 75264> v8;
+  alignas(16) static std::array<uint16_t, 225792> v9;
+  alignas(16) static std::array<uint16_t, 225792> v10;
+  alignas(16) static std::array<uint16_t, 75264> v11;
+  alignas(16) static std::array<uint16_t, 75264> v12;
+  alignas(16) static std::array<uint16_t, 225792> v13;
+  alignas(16) static std::array<uint16_t, 56448> v14;
+  alignas(16) static std::array<uint16_t, 72> v15;
+  alignas(16) static std::array<uint16_t, 24> v16;
+  alignas(16) static std::array<uint16_t, 72> v17;
+  alignas(16) static std::array<uint16_t, 56448> v18;
+  alignas(16) static std::array<uint16_t, 31360> v19;
+  alignas(16) static std::array<uint16_t, 94080> v20;
+  alignas(16) static std::array<uint16_t, 94080> v21;
+  alignas(16) static std::array<uint16_t, 120> v22;
+  alignas(16) static std::array<uint16_t, 32> v23;
+  alignas(16) static std::array<uint16_t, 120> v24;
+  alignas(16) static std::array<uint16_t, 94080> v25;
+  alignas(16) static std::array<uint16_t, 31360> v26;
+  alignas(16) static std::array<uint16_t, 31360> v27;
+  alignas(16) static std::array<uint16_t, 94080> v28;
+  alignas(16) static std::array<uint16_t, 94080> v29;
+  alignas(16) static std::array<uint16_t, 120> v30;
+  alignas(16) static std::array<uint16_t, 32> v31;
+  alignas(16) static std::array<uint16_t, 120> v32;
+  alignas(16) static std::array<uint16_t, 94080> v33;
+  alignas(16) static std::array<uint16_t, 31360> v34;
+  alignas(16) static std::array<uint16_t, 31360> v35;
+  alignas(16) static std::array<uint16_t, 188160> v36;
+  alignas(16) static std::array<uint16_t, 188160> v37;
+  alignas(16) static std::array<uint16_t, 47040> v38;
+  alignas(16) static std::array<uint16_t, 47040> v39;
+  alignas(16) static std::array<uint16_t, 15680> v40;
+  alignas(16) static std::array<uint16_t, 39200> v41;
+  alignas(16) static std::array<uint16_t, 39200> v42;
+  alignas(16) static std::array<uint16_t, 39200> v43;
+  alignas(16) static std::array<uint16_t, 39200> v44;
+  alignas(16) static std::array<uint16_t, 15680> v45;
+  alignas(16) static std::array<uint16_t, 15680> v46;
+  alignas(16) static std::array<uint16_t, 36064> v47;
+  alignas(16) static std::array<uint16_t, 36064> v48;
+  alignas(16) static std::array<uint16_t, 36064> v49;
+  alignas(16) static std::array<uint16_t, 36064> v50;
+  alignas(16) static std::array<uint16_t, 15680> v51;
+  alignas(16) static std::array<uint16_t, 15680> v52;
+  alignas(16) static std::array<uint16_t, 36064> v53;
+  alignas(16) static std::array<uint16_t, 36064> v54;
+  alignas(16) static std::array<uint16_t, 36064> v55;
+  alignas(16) static std::array<uint16_t, 36064> v56;
+  alignas(16) static std::array<uint16_t, 15680> v57;
+  alignas(16) static std::array<uint16_t, 15680> v58;
+  alignas(16) static std::array<uint16_t, 94080> v59;
+  alignas(16) static std::array<uint16_t, 94080> v60;
+  alignas(16) static std::array<uint16_t, 94080> v61;
+  alignas(16) static std::array<uint16_t, 94080> v62;
+  alignas(16) static std::array<uint16_t, 480> v63;
+  alignas(16) static std::array<uint16_t, 120> v64;
+  alignas(16) static std::array<uint16_t, 480> v65;
+  alignas(16) static std::array<uint16_t, 94080> v66;
+  alignas(16) static std::array<uint16_t, 21952> v67;
+  alignas(16) static std::array<uint16_t, 131712> v68;
+  alignas(16) static std::array<uint16_t, 131712> v69;
+  alignas(16) static std::array<uint16_t, 131712> v70;
+  alignas(16) static std::array<uint16_t, 131712> v71;
+  alignas(16) static std::array<uint16_t, 672> v72;
+  alignas(16) static std::array<uint16_t, 168> v73;
+  alignas(16) static std::array<uint16_t, 672> v74;
+  alignas(16) static std::array<uint16_t, 131712> v75;
+  alignas(16) static std::array<uint16_t, 21952> v76;
+  alignas(16) static std::array<uint16_t, 21952> v77;
+  alignas(16) static std::array<uint16_t, 131712> v78;
+  alignas(16) static std::array<uint16_t, 131712> v79;
+  alignas(16) static std::array<uint16_t, 32928> v80;
+  alignas(16) static std::array<uint16_t, 32928> v81;
+  alignas(16) static std::array<uint16_t, 672> v82;
+  alignas(16) static std::array<uint16_t, 168> v83;
+  alignas(16) static std::array<uint16_t, 672> v84;
+  alignas(16) static std::array<uint16_t, 32928> v85;
+  alignas(16) static std::array<uint16_t, 7840> v86;
+  alignas(16) static std::array<uint16_t, 47040> v87;
+  alignas(16) static std::array<uint16_t, 47040> v88;
+  alignas(16) static std::array<uint16_t, 47040> v89;
+  alignas(16) static std::array<uint16_t, 47040> v90;
+  alignas(16) static std::array<uint16_t, 960> v91;
+  alignas(16) static std::array<uint16_t, 240> v92;
+  alignas(16) static std::array<uint16_t, 960> v93;
+  alignas(16) static std::array<uint16_t, 47040> v94;
+  alignas(16) static std::array<uint16_t, 7840> v95;
+  alignas(16) static std::array<uint16_t, 7840> v96;
+  alignas(16) static std::array<uint16_t, 47040> v97;
+  alignas(16) static std::array<uint16_t, 47040> v98;
+  alignas(16) static std::array<uint16_t, 47040> v99;
+  alignas(16) static std::array<uint16_t, 47040> v100;
+  alignas(16) static std::array<uint16_t, 960> v101;
+  alignas(16) static std::array<uint16_t, 240> v102;
+  alignas(16) static std::array<uint16_t, 960> v103;
+  alignas(16) static std::array<uint16_t, 47040> v104;
+  alignas(16) static std::array<uint16_t, 7840> v105;
+  alignas(16) static std::array<uint16_t, 7840> v106;
+  alignas(16) static std::array<uint16_t, 47040> v107;
+  alignas(16) static std::array<uint16_t, 47040> v108;
+  alignas(16) static std::array<uint16_t, 960> v109;
+  alignas(16) static std::array<uint16_t, 1280> v110;
+  alignas(16) static std::array<uint16_t, 1280> v111;
+  alignas(16) static std::array<uint16_t, 1280> v112;
+  alignas(16) static std::array<uint16_t, 1001> v113;
+  alignas(16) static std::array<uint16_t, 432> w114;
+  alignas(16) static std::array<uint16_t, 16> w115;
+  alignas(16) static std::array<uint16_t, 144> w116;
+  alignas(16) static std::array<uint16_t, 16> w117;
+  alignas(16) static std::array<uint16_t, 256> w118;
+  alignas(16) static std::array<uint16_t, 16> w119;
+  alignas(16) static std::array<uint16_t, 1024> w120;
+  alignas(16) static std::array<uint16_t, 64> w121;
+  alignas(16) static std::array<uint16_t, 576> w122;
+  alignas(16) static std::array<uint16_t, 64> w123;
+  alignas(16) static std::array<uint16_t, 1536> w124;
+  alignas(16) static std::array<uint16_t, 24> w125;
+  alignas(16) static std::array<uint16_t, 1728> w126;
+  alignas(16) static std::array<uint16_t, 72> w127;
+  alignas(16) static std::array<uint16_t, 648> w128;
+  alignas(16) static std::array<uint16_t, 72> w129;
+  alignas(16) static std::array<uint16_t, 1728> w130;
+  alignas(16) static std::array<uint16_t, 24> w131;
+  alignas(16) static std::array<uint16_t, 1728> w132;
+  alignas(16) static std::array<uint16_t, 72> w133;
+  alignas(16) static std::array<uint16_t, 1800> w134;
+  alignas(16) static std::array<uint16_t, 72> w135;
+  alignas(16) static std::array<uint16_t, 1728> w136;
+  alignas(16) static std::array<uint16_t, 24> w137;
+  alignas(16) static std::array<uint16_t, 1728> w138;
+  alignas(16) static std::array<uint16_t, 72> w139;
+  alignas(16) static std::array<uint16_t, 2880> w140;
+  alignas(16) static std::array<uint16_t, 40> w141;
+  alignas(16) static std::array<uint16_t, 4800> w142;
+  alignas(16) static std::array<uint16_t, 120> w143;
+  alignas(16) static std::array<uint16_t, 3000> w144;
+  alignas(16) static std::array<uint16_t, 120> w145;
+  alignas(16) static std::array<uint16_t, 3840> w146;
+  alignas(16) static std::array<uint16_t, 32> w147;
+  alignas(16) static std::array<uint16_t, 3840> w148;
+  alignas(16) static std::array<uint16_t, 120> w149;
+  alignas(16) static std::array<uint16_t, 4800> w150;
+  alignas(16) static std::array<uint16_t, 40> w151;
+  alignas(16) static std::array<uint16_t, 4800> w152;
+  alignas(16) static std::array<uint16_t, 120> w153;
+  alignas(16) static std::array<uint16_t, 3000> w154;
+  alignas(16) static std::array<uint16_t, 120> w155;
+  alignas(16) static std::array<uint16_t, 3840> w156;
+  alignas(16) static std::array<uint16_t, 32> w157;
+  alignas(16) static std::array<uint16_t, 3840> w158;
+  alignas(16) static std::array<uint16_t, 120> w159;
+  alignas(16) static std::array<uint16_t, 4800> w160;
+  alignas(16) static std::array<uint16_t, 40> w161;
+  alignas(16) static std::array<uint16_t, 9600> w162;
+  alignas(16) static std::array<uint16_t, 240> w163;
+  alignas(16) static std::array<uint16_t, 2160> w164;
+  alignas(16) static std::array<uint16_t, 240> w165;
+  alignas(16) static std::array<uint16_t, 19200> w166;
+  alignas(16) static std::array<uint16_t, 80> w167;
+  alignas(16) static std::array<uint16_t, 16000> w168;
+  alignas(16) static std::array<uint16_t, 200> w169;
+  alignas(16) static std::array<uint16_t, 1800> w170;
+  alignas(16) static std::array<uint16_t, 200> w171;
+  alignas(16) static std::array<uint16_t, 16000> w172;
+  alignas(16) static std::array<uint16_t, 80> w173;
+  alignas(16) static std::array<uint16_t, 14720> w174;
+  alignas(16) static std::array<uint16_t, 184> w175;
+  alignas(16) static std::array<uint16_t, 1656> w176;
+  alignas(16) static std::array<uint16_t, 184> w177;
+  alignas(16) static std::array<uint16_t, 14720> w178;
+  alignas(16) static std::array<uint16_t, 80> w179;
+  alignas(16) static std::array<uint16_t, 14720> w180;
+  alignas(16) static std::array<uint16_t, 184> w181;
+  alignas(16) static std::array<uint16_t, 1656> w182;
+  alignas(16) static std::array<uint16_t, 184> w183;
+  alignas(16) static std::array<uint16_t, 14720> w184;
+  alignas(16) static std::array<uint16_t, 80> w185;
+  alignas(16) static std::array<uint16_t, 38400> w186;
+  alignas(16) static std::array<uint16_t, 480> w187;
+  alignas(16) static std::array<uint16_t, 4320> w188;
+  alignas(16) static std::array<uint16_t, 480> w189;
+  alignas(16) static std::array<uint16_t, 57600> w190;
+  alignas(16) static std::array<uint16_t, 120> w191;
+  alignas(16) static std::array<uint16_t, 57600> w192;
+  alignas(16) static std::array<uint16_t, 480> w193;
+  alignas(16) static std::array<uint16_t, 53760> w194;
+  alignas(16) static std::array<uint16_t, 112> w195;
+  alignas(16) static std::array<uint16_t, 75264> w196;
+  alignas(16) static std::array<uint16_t, 672> w197;
+  alignas(16) static std::array<uint16_t, 6048> w198;
+  alignas(16) static std::array<uint16_t, 672> w199;
+  alignas(16) static std::array<uint16_t, 112896> w200;
+  alignas(16) static std::array<uint16_t, 168> w201;
+  alignas(16) static std::array<uint16_t, 112896> w202;
+  alignas(16) static std::array<uint16_t, 672> w203;
+  alignas(16) static std::array<uint16_t, 75264> w204;
+  alignas(16) static std::array<uint16_t, 112> w205;
+  alignas(16) static std::array<uint16_t, 75264> w206;
+  alignas(16) static std::array<uint16_t, 672> w207;
+  alignas(16) static std::array<uint16_t, 16800> w208;
+  alignas(16) static std::array<uint16_t, 672> w209;
+  alignas(16) static std::array<uint16_t, 112896> w210;
+  alignas(16) static std::array<uint16_t, 168> w211;
+  alignas(16) static std::array<uint16_t, 112896> w212;
+  alignas(16) static std::array<uint16_t, 672> w213;
+  alignas(16) static std::array<uint16_t, 107520> w214;
+  alignas(16) static std::array<uint16_t, 160> w215;
+  alignas(16) static std::array<uint16_t, 153600> w216;
+  alignas(16) static std::array<uint16_t, 960> w217;
+  alignas(16) static std::array<uint16_t, 24000> w218;
+  alignas(16) static std::array<uint16_t, 960> w219;
+  alignas(16) static std::array<uint16_t, 230400> w220;
+  alignas(16) static std::array<uint16_t, 240> w221;
+  alignas(16) static std::array<uint16_t, 230400> w222;
+  alignas(16) static std::array<uint16_t, 960> w223;
+  alignas(16) static std::array<uint16_t, 153600> w224;
+  alignas(16) static std::array<uint16_t, 160> w225;
+  alignas(16) static std::array<uint16_t, 153600> w226;
+  alignas(16) static std::array<uint16_t, 960> w227;
+  alignas(16) static std::array<uint16_t, 24000> w228;
+  alignas(16) static std::array<uint16_t, 960> w229;
+  alignas(16) static std::array<uint16_t, 230400> w230;
+  alignas(16) static std::array<uint16_t, 240> w231;
+  alignas(16) static std::array<uint16_t, 230400> w232;
+  alignas(16) static std::array<uint16_t, 960> w233;
+  alignas(16) static std::array<uint16_t, 153600> w234;
+  alignas(16) static std::array<uint16_t, 160> w235;
+  alignas(16) static std::array<uint16_t, 153600> w236;
+  alignas(16) static std::array<uint16_t, 960> w237;
+  alignas(16) static std::array<uint16_t, 1228800> w238;
+  alignas(16) static std::array<uint16_t, 1280> w239;
+  alignas(16) static std::array<uint16_t, 1281280> w240;
+  alignas(16) static std::array<uint16_t, 1001> w241;
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.0f, +1.0f), std::ref(rng));
   auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
-  std::generate(v0, v0 + 150528, std::ref(f16rng));
-  std::generate(v1, v1 + 200704, std::ref(f16rng));
-  std::generate(v2, v2 + 200704, std::ref(f16rng));
-  std::generate(v3, v3 + 200704, std::ref(f16rng));
-  std::generate(v4, v4 + 200704, std::ref(f16rng));
-  std::generate(v5, v5 + 200704, std::ref(f16rng));
-  std::generate(v6, v6 + 802816, std::ref(f16rng));
-  std::generate(v7, v7 + 200704, std::ref(f16rng));
-  std::generate(v8, v8 + 75264, std::ref(f16rng));
-  std::generate(v9, v9 + 225792, std::ref(f16rng));
-  std::generate(v10, v10 + 225792, std::ref(f16rng));
-  std::generate(v11, v11 + 75264, std::ref(f16rng));
-  std::generate(v12, v12 + 75264, std::ref(f16rng));
-  std::generate(v13, v13 + 225792, std::ref(f16rng));
-  std::generate(v14, v14 + 56448, std::ref(f16rng));
-  std::generate(v15, v15 + 72, std::ref(f16rng));
-  std::generate(v16, v16 + 24, std::ref(f16rng));
-  std::generate(v17, v17 + 72, std::ref(f16rng));
-  std::generate(v18, v18 + 56448, std::ref(f16rng));
-  std::generate(v19, v19 + 31360, std::ref(f16rng));
-  std::generate(v20, v20 + 94080, std::ref(f16rng));
-  std::generate(v21, v21 + 94080, std::ref(f16rng));
-  std::generate(v22, v22 + 120, std::ref(f16rng));
-  std::generate(v23, v23 + 32, std::ref(f16rng));
-  std::generate(v24, v24 + 120, std::ref(f16rng));
-  std::generate(v25, v25 + 94080, std::ref(f16rng));
-  std::generate(v26, v26 + 31360, std::ref(f16rng));
-  std::generate(v27, v27 + 31360, std::ref(f16rng));
-  std::generate(v28, v28 + 94080, std::ref(f16rng));
-  std::generate(v29, v29 + 94080, std::ref(f16rng));
-  std::generate(v30, v30 + 120, std::ref(f16rng));
-  std::generate(v31, v31 + 32, std::ref(f16rng));
-  std::generate(v32, v32 + 120, std::ref(f16rng));
-  std::generate(v33, v33 + 94080, std::ref(f16rng));
-  std::generate(v34, v34 + 31360, std::ref(f16rng));
-  std::generate(v35, v35 + 31360, std::ref(f16rng));
-  std::generate(v36, v36 + 188160, std::ref(f16rng));
-  std::generate(v37, v37 + 188160, std::ref(f16rng));
-  std::generate(v38, v38 + 47040, std::ref(f16rng));
-  std::generate(v39, v39 + 47040, std::ref(f16rng));
-  std::generate(v40, v40 + 15680, std::ref(f16rng));
-  std::generate(v41, v41 + 39200, std::ref(f16rng));
-  std::generate(v42, v42 + 39200, std::ref(f16rng));
-  std::generate(v43, v43 + 39200, std::ref(f16rng));
-  std::generate(v44, v44 + 39200, std::ref(f16rng));
-  std::generate(v45, v45 + 15680, std::ref(f16rng));
-  std::generate(v46, v46 + 15680, std::ref(f16rng));
-  std::generate(v47, v47 + 36064, std::ref(f16rng));
-  std::generate(v48, v48 + 36064, std::ref(f16rng));
-  std::generate(v49, v49 + 36064, std::ref(f16rng));
-  std::generate(v50, v50 + 36064, std::ref(f16rng));
-  std::generate(v51, v51 + 15680, std::ref(f16rng));
-  std::generate(v52, v52 + 15680, std::ref(f16rng));
-  std::generate(v53, v53 + 36064, std::ref(f16rng));
-  std::generate(v54, v54 + 36064, std::ref(f16rng));
-  std::generate(v55, v55 + 36064, std::ref(f16rng));
-  std::generate(v56, v56 + 36064, std::ref(f16rng));
-  std::generate(v57, v57 + 15680, std::ref(f16rng));
-  std::generate(v58, v58 + 15680, std::ref(f16rng));
-  std::generate(v59, v59 + 94080, std::ref(f16rng));
-  std::generate(v60, v60 + 94080, std::ref(f16rng));
-  std::generate(v61, v61 + 94080, std::ref(f16rng));
-  std::generate(v62, v62 + 94080, std::ref(f16rng));
-  std::generate(v63, v63 + 480, std::ref(f16rng));
-  std::generate(v64, v64 + 120, std::ref(f16rng));
-  std::generate(v65, v65 + 480, std::ref(f16rng));
-  std::generate(v66, v66 + 94080, std::ref(f16rng));
-  std::generate(v67, v67 + 21952, std::ref(f16rng));
-  std::generate(v68, v68 + 131712, std::ref(f16rng));
-  std::generate(v69, v69 + 131712, std::ref(f16rng));
-  std::generate(v70, v70 + 131712, std::ref(f16rng));
-  std::generate(v71, v71 + 131712, std::ref(f16rng));
-  std::generate(v72, v72 + 672, std::ref(f16rng));
-  std::generate(v73, v73 + 168, std::ref(f16rng));
-  std::generate(v74, v74 + 672, std::ref(f16rng));
-  std::generate(v75, v75 + 131712, std::ref(f16rng));
-  std::generate(v76, v76 + 21952, std::ref(f16rng));
-  std::generate(v77, v77 + 21952, std::ref(f16rng));
-  std::generate(v78, v78 + 131712, std::ref(f16rng));
-  std::generate(v79, v79 + 131712, std::ref(f16rng));
-  std::generate(v80, v80 + 32928, std::ref(f16rng));
-  std::generate(v81, v81 + 32928, std::ref(f16rng));
-  std::generate(v82, v82 + 672, std::ref(f16rng));
-  std::generate(v83, v83 + 168, std::ref(f16rng));
-  std::generate(v84, v84 + 672, std::ref(f16rng));
-  std::generate(v85, v85 + 32928, std::ref(f16rng));
-  std::generate(v86, v86 + 7840, std::ref(f16rng));
-  std::generate(v87, v87 + 47040, std::ref(f16rng));
-  std::generate(v88, v88 + 47040, std::ref(f16rng));
-  std::generate(v89, v89 + 47040, std::ref(f16rng));
-  std::generate(v90, v90 + 47040, std::ref(f16rng));
-  std::generate(v91, v91 + 960, std::ref(f16rng));
-  std::generate(v92, v92 + 240, std::ref(f16rng));
-  std::generate(v93, v93 + 960, std::ref(f16rng));
-  std::generate(v94, v94 + 47040, std::ref(f16rng));
-  std::generate(v95, v95 + 7840, std::ref(f16rng));
-  std::generate(v96, v96 + 7840, std::ref(f16rng));
-  std::generate(v97, v97 + 47040, std::ref(f16rng));
-  std::generate(v98, v98 + 47040, std::ref(f16rng));
-  std::generate(v99, v99 + 47040, std::ref(f16rng));
-  std::generate(v100, v100 + 47040, std::ref(f16rng));
-  std::generate(v101, v101 + 960, std::ref(f16rng));
-  std::generate(v102, v102 + 240, std::ref(f16rng));
-  std::generate(v103, v103 + 960, std::ref(f16rng));
-  std::generate(v104, v104 + 47040, std::ref(f16rng));
-  std::generate(v105, v105 + 7840, std::ref(f16rng));
-  std::generate(v106, v106 + 7840, std::ref(f16rng));
-  std::generate(v107, v107 + 47040, std::ref(f16rng));
-  std::generate(v108, v108 + 47040, std::ref(f16rng));
-  std::generate(v109, v109 + 960, std::ref(f16rng));
-  std::generate(v110, v110 + 1280, std::ref(f16rng));
-  std::generate(v111, v111 + 1280, std::ref(f16rng));
-  std::generate(v112, v112 + 1280, std::ref(f16rng));
-  std::generate(v113, v113 + 1001, std::ref(f16rng));
-  std::generate(w114, w114 + 432, std::ref(f16rng));
-  std::generate(w115, w115 + 16, std::ref(f16rng));
-  std::generate(w116, w116 + 144, std::ref(f16rng));
-  std::generate(w117, w117 + 16, std::ref(f16rng));
-  std::generate(w118, w118 + 256, std::ref(f16rng));
-  std::generate(w119, w119 + 16, std::ref(f16rng));
-  std::generate(w120, w120 + 1024, std::ref(f16rng));
-  std::generate(w121, w121 + 64, std::ref(f16rng));
-  std::generate(w122, w122 + 576, std::ref(f16rng));
-  std::generate(w123, w123 + 64, std::ref(f16rng));
-  std::generate(w124, w124 + 1536, std::ref(f16rng));
-  std::generate(w125, w125 + 24, std::ref(f16rng));
-  std::generate(w126, w126 + 1728, std::ref(f16rng));
-  std::generate(w127, w127 + 72, std::ref(f16rng));
-  std::generate(w128, w128 + 648, std::ref(f16rng));
-  std::generate(w129, w129 + 72, std::ref(f16rng));
-  std::generate(w130, w130 + 1728, std::ref(f16rng));
-  std::generate(w131, w131 + 24, std::ref(f16rng));
-  std::generate(w132, w132 + 1728, std::ref(f16rng));
-  std::generate(w133, w133 + 72, std::ref(f16rng));
-  std::generate(w134, w134 + 1800, std::ref(f16rng));
-  std::generate(w135, w135 + 72, std::ref(f16rng));
-  std::generate(w136, w136 + 1728, std::ref(f16rng));
-  std::generate(w137, w137 + 24, std::ref(f16rng));
-  std::generate(w138, w138 + 1728, std::ref(f16rng));
-  std::generate(w139, w139 + 72, std::ref(f16rng));
-  std::generate(w140, w140 + 2880, std::ref(f16rng));
-  std::generate(w141, w141 + 40, std::ref(f16rng));
-  std::generate(w142, w142 + 4800, std::ref(f16rng));
-  std::generate(w143, w143 + 120, std::ref(f16rng));
-  std::generate(w144, w144 + 3000, std::ref(f16rng));
-  std::generate(w145, w145 + 120, std::ref(f16rng));
-  std::generate(w146, w146 + 3840, std::ref(f16rng));
-  std::generate(w147, w147 + 32, std::ref(f16rng));
-  std::generate(w148, w148 + 3840, std::ref(f16rng));
-  std::generate(w149, w149 + 120, std::ref(f16rng));
-  std::generate(w150, w150 + 4800, std::ref(f16rng));
-  std::generate(w151, w151 + 40, std::ref(f16rng));
-  std::generate(w152, w152 + 4800, std::ref(f16rng));
-  std::generate(w153, w153 + 120, std::ref(f16rng));
-  std::generate(w154, w154 + 3000, std::ref(f16rng));
-  std::generate(w155, w155 + 120, std::ref(f16rng));
-  std::generate(w156, w156 + 3840, std::ref(f16rng));
-  std::generate(w157, w157 + 32, std::ref(f16rng));
-  std::generate(w158, w158 + 3840, std::ref(f16rng));
-  std::generate(w159, w159 + 120, std::ref(f16rng));
-  std::generate(w160, w160 + 4800, std::ref(f16rng));
-  std::generate(w161, w161 + 40, std::ref(f16rng));
-  std::generate(w162, w162 + 9600, std::ref(f16rng));
-  std::generate(w163, w163 + 240, std::ref(f16rng));
-  std::generate(w164, w164 + 2160, std::ref(f16rng));
-  std::generate(w165, w165 + 240, std::ref(f16rng));
-  std::generate(w166, w166 + 19200, std::ref(f16rng));
-  std::generate(w167, w167 + 80, std::ref(f16rng));
-  std::generate(w168, w168 + 16000, std::ref(f16rng));
-  std::generate(w169, w169 + 200, std::ref(f16rng));
-  std::generate(w170, w170 + 1800, std::ref(f16rng));
-  std::generate(w171, w171 + 200, std::ref(f16rng));
-  std::generate(w172, w172 + 16000, std::ref(f16rng));
-  std::generate(w173, w173 + 80, std::ref(f16rng));
-  std::generate(w174, w174 + 14720, std::ref(f16rng));
-  std::generate(w175, w175 + 184, std::ref(f16rng));
-  std::generate(w176, w176 + 1656, std::ref(f16rng));
-  std::generate(w177, w177 + 184, std::ref(f16rng));
-  std::generate(w178, w178 + 14720, std::ref(f16rng));
-  std::generate(w179, w179 + 80, std::ref(f16rng));
-  std::generate(w180, w180 + 14720, std::ref(f16rng));
-  std::generate(w181, w181 + 184, std::ref(f16rng));
-  std::generate(w182, w182 + 1656, std::ref(f16rng));
-  std::generate(w183, w183 + 184, std::ref(f16rng));
-  std::generate(w184, w184 + 14720, std::ref(f16rng));
-  std::generate(w185, w185 + 80, std::ref(f16rng));
-  std::generate(w186, w186 + 38400, std::ref(f16rng));
-  std::generate(w187, w187 + 480, std::ref(f16rng));
-  std::generate(w188, w188 + 4320, std::ref(f16rng));
-  std::generate(w189, w189 + 480, std::ref(f16rng));
-  std::generate(w190, w190 + 57600, std::ref(f16rng));
-  std::generate(w191, w191 + 120, std::ref(f16rng));
-  std::generate(w192, w192 + 57600, std::ref(f16rng));
-  std::generate(w193, w193 + 480, std::ref(f16rng));
-  std::generate(w194, w194 + 53760, std::ref(f16rng));
-  std::generate(w195, w195 + 112, std::ref(f16rng));
-  std::generate(w196, w196 + 75264, std::ref(f16rng));
-  std::generate(w197, w197 + 672, std::ref(f16rng));
-  std::generate(w198, w198 + 6048, std::ref(f16rng));
-  std::generate(w199, w199 + 672, std::ref(f16rng));
-  std::generate(w200, w200 + 112896, std::ref(f16rng));
-  std::generate(w201, w201 + 168, std::ref(f16rng));
-  std::generate(w202, w202 + 112896, std::ref(f16rng));
-  std::generate(w203, w203 + 672, std::ref(f16rng));
-  std::generate(w204, w204 + 75264, std::ref(f16rng));
-  std::generate(w205, w205 + 112, std::ref(f16rng));
-  std::generate(w206, w206 + 75264, std::ref(f16rng));
-  std::generate(w207, w207 + 672, std::ref(f16rng));
-  std::generate(w208, w208 + 16800, std::ref(f16rng));
-  std::generate(w209, w209 + 672, std::ref(f16rng));
-  std::generate(w210, w210 + 112896, std::ref(f16rng));
-  std::generate(w211, w211 + 168, std::ref(f16rng));
-  std::generate(w212, w212 + 112896, std::ref(f16rng));
-  std::generate(w213, w213 + 672, std::ref(f16rng));
-  std::generate(w214, w214 + 107520, std::ref(f16rng));
-  std::generate(w215, w215 + 160, std::ref(f16rng));
-  std::generate(w216, w216 + 153600, std::ref(f16rng));
-  std::generate(w217, w217 + 960, std::ref(f16rng));
-  std::generate(w218, w218 + 24000, std::ref(f16rng));
-  std::generate(w219, w219 + 960, std::ref(f16rng));
-  std::generate(w220, w220 + 230400, std::ref(f16rng));
-  std::generate(w221, w221 + 240, std::ref(f16rng));
-  std::generate(w222, w222 + 230400, std::ref(f16rng));
-  std::generate(w223, w223 + 960, std::ref(f16rng));
-  std::generate(w224, w224 + 153600, std::ref(f16rng));
-  std::generate(w225, w225 + 160, std::ref(f16rng));
-  std::generate(w226, w226 + 153600, std::ref(f16rng));
-  std::generate(w227, w227 + 960, std::ref(f16rng));
-  std::generate(w228, w228 + 24000, std::ref(f16rng));
-  std::generate(w229, w229 + 960, std::ref(f16rng));
-  std::generate(w230, w230 + 230400, std::ref(f16rng));
-  std::generate(w231, w231 + 240, std::ref(f16rng));
-  std::generate(w232, w232 + 230400, std::ref(f16rng));
-  std::generate(w233, w233 + 960, std::ref(f16rng));
-  std::generate(w234, w234 + 153600, std::ref(f16rng));
-  std::generate(w235, w235 + 160, std::ref(f16rng));
-  std::generate(w236, w236 + 153600, std::ref(f16rng));
-  std::generate(w237, w237 + 960, std::ref(f16rng));
-  std::generate(w238, w238 + 1228800, std::ref(f16rng));
-  std::generate(w239, w239 + 1280, std::ref(f16rng));
-  std::generate(w240, w240 + 1281280, std::ref(f16rng));
-  std::generate(w241, w241 + 1001, std::ref(f16rng));
+  std::generate(v0.begin(), v0.end(), std::ref(f16rng));
+  std::generate(v1.begin(), v1.end(), std::ref(f16rng));
+  std::generate(v2.begin(), v2.end(), std::ref(f16rng));
+  std::generate(v3.begin(), v3.end(), std::ref(f16rng));
+  std::generate(v4.begin(), v4.end(), std::ref(f16rng));
+  std::generate(v5.begin(), v5.end(), std::ref(f16rng));
+  std::generate(v6.begin(), v6.end(), std::ref(f16rng));
+  std::generate(v7.begin(), v7.end(), std::ref(f16rng));
+  std::generate(v8.begin(), v8.end(), std::ref(f16rng));
+  std::generate(v9.begin(), v9.end(), std::ref(f16rng));
+  std::generate(v10.begin(), v10.end(), std::ref(f16rng));
+  std::generate(v11.begin(), v11.end(), std::ref(f16rng));
+  std::generate(v12.begin(), v12.end(), std::ref(f16rng));
+  std::generate(v13.begin(), v13.end(), std::ref(f16rng));
+  std::generate(v14.begin(), v14.end(), std::ref(f16rng));
+  std::generate(v15.begin(), v15.end(), std::ref(f16rng));
+  std::generate(v16.begin(), v16.end(), std::ref(f16rng));
+  std::generate(v17.begin(), v17.end(), std::ref(f16rng));
+  std::generate(v18.begin(), v18.end(), std::ref(f16rng));
+  std::generate(v19.begin(), v19.end(), std::ref(f16rng));
+  std::generate(v20.begin(), v20.end(), std::ref(f16rng));
+  std::generate(v21.begin(), v21.end(), std::ref(f16rng));
+  std::generate(v22.begin(), v22.end(), std::ref(f16rng));
+  std::generate(v23.begin(), v23.end(), std::ref(f16rng));
+  std::generate(v24.begin(), v24.end(), std::ref(f16rng));
+  std::generate(v25.begin(), v25.end(), std::ref(f16rng));
+  std::generate(v26.begin(), v26.end(), std::ref(f16rng));
+  std::generate(v27.begin(), v27.end(), std::ref(f16rng));
+  std::generate(v28.begin(), v28.end(), std::ref(f16rng));
+  std::generate(v29.begin(), v29.end(), std::ref(f16rng));
+  std::generate(v30.begin(), v30.end(), std::ref(f16rng));
+  std::generate(v31.begin(), v31.end(), std::ref(f16rng));
+  std::generate(v32.begin(), v32.end(), std::ref(f16rng));
+  std::generate(v33.begin(), v33.end(), std::ref(f16rng));
+  std::generate(v34.begin(), v34.end(), std::ref(f16rng));
+  std::generate(v35.begin(), v35.end(), std::ref(f16rng));
+  std::generate(v36.begin(), v36.end(), std::ref(f16rng));
+  std::generate(v37.begin(), v37.end(), std::ref(f16rng));
+  std::generate(v38.begin(), v38.end(), std::ref(f16rng));
+  std::generate(v39.begin(), v39.end(), std::ref(f16rng));
+  std::generate(v40.begin(), v40.end(), std::ref(f16rng));
+  std::generate(v41.begin(), v41.end(), std::ref(f16rng));
+  std::generate(v42.begin(), v42.end(), std::ref(f16rng));
+  std::generate(v43.begin(), v43.end(), std::ref(f16rng));
+  std::generate(v44.begin(), v44.end(), std::ref(f16rng));
+  std::generate(v45.begin(), v45.end(), std::ref(f16rng));
+  std::generate(v46.begin(), v46.end(), std::ref(f16rng));
+  std::generate(v47.begin(), v47.end(), std::ref(f16rng));
+  std::generate(v48.begin(), v48.end(), std::ref(f16rng));
+  std::generate(v49.begin(), v49.end(), std::ref(f16rng));
+  std::generate(v50.begin(), v50.end(), std::ref(f16rng));
+  std::generate(v51.begin(), v51.end(), std::ref(f16rng));
+  std::generate(v52.begin(), v52.end(), std::ref(f16rng));
+  std::generate(v53.begin(), v53.end(), std::ref(f16rng));
+  std::generate(v54.begin(), v54.end(), std::ref(f16rng));
+  std::generate(v55.begin(), v55.end(), std::ref(f16rng));
+  std::generate(v56.begin(), v56.end(), std::ref(f16rng));
+  std::generate(v57.begin(), v57.end(), std::ref(f16rng));
+  std::generate(v58.begin(), v58.end(), std::ref(f16rng));
+  std::generate(v59.begin(), v59.end(), std::ref(f16rng));
+  std::generate(v60.begin(), v60.end(), std::ref(f16rng));
+  std::generate(v61.begin(), v61.end(), std::ref(f16rng));
+  std::generate(v62.begin(), v62.end(), std::ref(f16rng));
+  std::generate(v63.begin(), v63.end(), std::ref(f16rng));
+  std::generate(v64.begin(), v64.end(), std::ref(f16rng));
+  std::generate(v65.begin(), v65.end(), std::ref(f16rng));
+  std::generate(v66.begin(), v66.end(), std::ref(f16rng));
+  std::generate(v67.begin(), v67.end(), std::ref(f16rng));
+  std::generate(v68.begin(), v68.end(), std::ref(f16rng));
+  std::generate(v69.begin(), v69.end(), std::ref(f16rng));
+  std::generate(v70.begin(), v70.end(), std::ref(f16rng));
+  std::generate(v71.begin(), v71.end(), std::ref(f16rng));
+  std::generate(v72.begin(), v72.end(), std::ref(f16rng));
+  std::generate(v73.begin(), v73.end(), std::ref(f16rng));
+  std::generate(v74.begin(), v74.end(), std::ref(f16rng));
+  std::generate(v75.begin(), v75.end(), std::ref(f16rng));
+  std::generate(v76.begin(), v76.end(), std::ref(f16rng));
+  std::generate(v77.begin(), v77.end(), std::ref(f16rng));
+  std::generate(v78.begin(), v78.end(), std::ref(f16rng));
+  std::generate(v79.begin(), v79.end(), std::ref(f16rng));
+  std::generate(v80.begin(), v80.end(), std::ref(f16rng));
+  std::generate(v81.begin(), v81.end(), std::ref(f16rng));
+  std::generate(v82.begin(), v82.end(), std::ref(f16rng));
+  std::generate(v83.begin(), v83.end(), std::ref(f16rng));
+  std::generate(v84.begin(), v84.end(), std::ref(f16rng));
+  std::generate(v85.begin(), v85.end(), std::ref(f16rng));
+  std::generate(v86.begin(), v86.end(), std::ref(f16rng));
+  std::generate(v87.begin(), v87.end(), std::ref(f16rng));
+  std::generate(v88.begin(), v88.end(), std::ref(f16rng));
+  std::generate(v89.begin(), v89.end(), std::ref(f16rng));
+  std::generate(v90.begin(), v90.end(), std::ref(f16rng));
+  std::generate(v91.begin(), v91.end(), std::ref(f16rng));
+  std::generate(v92.begin(), v92.end(), std::ref(f16rng));
+  std::generate(v93.begin(), v93.end(), std::ref(f16rng));
+  std::generate(v94.begin(), v94.end(), std::ref(f16rng));
+  std::generate(v95.begin(), v95.end(), std::ref(f16rng));
+  std::generate(v96.begin(), v96.end(), std::ref(f16rng));
+  std::generate(v97.begin(), v97.end(), std::ref(f16rng));
+  std::generate(v98.begin(), v98.end(), std::ref(f16rng));
+  std::generate(v99.begin(), v99.end(), std::ref(f16rng));
+  std::generate(v100.begin(), v100.end(), std::ref(f16rng));
+  std::generate(v101.begin(), v101.end(), std::ref(f16rng));
+  std::generate(v102.begin(), v102.end(), std::ref(f16rng));
+  std::generate(v103.begin(), v103.end(), std::ref(f16rng));
+  std::generate(v104.begin(), v104.end(), std::ref(f16rng));
+  std::generate(v105.begin(), v105.end(), std::ref(f16rng));
+  std::generate(v106.begin(), v106.end(), std::ref(f16rng));
+  std::generate(v107.begin(), v107.end(), std::ref(f16rng));
+  std::generate(v108.begin(), v108.end(), std::ref(f16rng));
+  std::generate(v109.begin(), v109.end(), std::ref(f16rng));
+  std::generate(v110.begin(), v110.end(), std::ref(f16rng));
+  std::generate(v111.begin(), v111.end(), std::ref(f16rng));
+  std::generate(v112.begin(), v112.end(), std::ref(f16rng));
+  std::generate(v113.begin(), v113.end(), std::ref(f16rng));
+  std::generate(w114.begin(), w114.end(), std::ref(f16rng));
+  std::generate(w115.begin(), w115.end(), std::ref(f16rng));
+  std::generate(w116.begin(), w116.end(), std::ref(f16rng));
+  std::generate(w117.begin(), w117.end(), std::ref(f16rng));
+  std::generate(w118.begin(), w118.end(), std::ref(f16rng));
+  std::generate(w119.begin(), w119.end(), std::ref(f16rng));
+  std::generate(w120.begin(), w120.end(), std::ref(f16rng));
+  std::generate(w121.begin(), w121.end(), std::ref(f16rng));
+  std::generate(w122.begin(), w122.end(), std::ref(f16rng));
+  std::generate(w123.begin(), w123.end(), std::ref(f16rng));
+  std::generate(w124.begin(), w124.end(), std::ref(f16rng));
+  std::generate(w125.begin(), w125.end(), std::ref(f16rng));
+  std::generate(w126.begin(), w126.end(), std::ref(f16rng));
+  std::generate(w127.begin(), w127.end(), std::ref(f16rng));
+  std::generate(w128.begin(), w128.end(), std::ref(f16rng));
+  std::generate(w129.begin(), w129.end(), std::ref(f16rng));
+  std::generate(w130.begin(), w130.end(), std::ref(f16rng));
+  std::generate(w131.begin(), w131.end(), std::ref(f16rng));
+  std::generate(w132.begin(), w132.end(), std::ref(f16rng));
+  std::generate(w133.begin(), w133.end(), std::ref(f16rng));
+  std::generate(w134.begin(), w134.end(), std::ref(f16rng));
+  std::generate(w135.begin(), w135.end(), std::ref(f16rng));
+  std::generate(w136.begin(), w136.end(), std::ref(f16rng));
+  std::generate(w137.begin(), w137.end(), std::ref(f16rng));
+  std::generate(w138.begin(), w138.end(), std::ref(f16rng));
+  std::generate(w139.begin(), w139.end(), std::ref(f16rng));
+  std::generate(w140.begin(), w140.end(), std::ref(f16rng));
+  std::generate(w141.begin(), w141.end(), std::ref(f16rng));
+  std::generate(w142.begin(), w142.end(), std::ref(f16rng));
+  std::generate(w143.begin(), w143.end(), std::ref(f16rng));
+  std::generate(w144.begin(), w144.end(), std::ref(f16rng));
+  std::generate(w145.begin(), w145.end(), std::ref(f16rng));
+  std::generate(w146.begin(), w146.end(), std::ref(f16rng));
+  std::generate(w147.begin(), w147.end(), std::ref(f16rng));
+  std::generate(w148.begin(), w148.end(), std::ref(f16rng));
+  std::generate(w149.begin(), w149.end(), std::ref(f16rng));
+  std::generate(w150.begin(), w150.end(), std::ref(f16rng));
+  std::generate(w151.begin(), w151.end(), std::ref(f16rng));
+  std::generate(w152.begin(), w152.end(), std::ref(f16rng));
+  std::generate(w153.begin(), w153.end(), std::ref(f16rng));
+  std::generate(w154.begin(), w154.end(), std::ref(f16rng));
+  std::generate(w155.begin(), w155.end(), std::ref(f16rng));
+  std::generate(w156.begin(), w156.end(), std::ref(f16rng));
+  std::generate(w157.begin(), w157.end(), std::ref(f16rng));
+  std::generate(w158.begin(), w158.end(), std::ref(f16rng));
+  std::generate(w159.begin(), w159.end(), std::ref(f16rng));
+  std::generate(w160.begin(), w160.end(), std::ref(f16rng));
+  std::generate(w161.begin(), w161.end(), std::ref(f16rng));
+  std::generate(w162.begin(), w162.end(), std::ref(f16rng));
+  std::generate(w163.begin(), w163.end(), std::ref(f16rng));
+  std::generate(w164.begin(), w164.end(), std::ref(f16rng));
+  std::generate(w165.begin(), w165.end(), std::ref(f16rng));
+  std::generate(w166.begin(), w166.end(), std::ref(f16rng));
+  std::generate(w167.begin(), w167.end(), std::ref(f16rng));
+  std::generate(w168.begin(), w168.end(), std::ref(f16rng));
+  std::generate(w169.begin(), w169.end(), std::ref(f16rng));
+  std::generate(w170.begin(), w170.end(), std::ref(f16rng));
+  std::generate(w171.begin(), w171.end(), std::ref(f16rng));
+  std::generate(w172.begin(), w172.end(), std::ref(f16rng));
+  std::generate(w173.begin(), w173.end(), std::ref(f16rng));
+  std::generate(w174.begin(), w174.end(), std::ref(f16rng));
+  std::generate(w175.begin(), w175.end(), std::ref(f16rng));
+  std::generate(w176.begin(), w176.end(), std::ref(f16rng));
+  std::generate(w177.begin(), w177.end(), std::ref(f16rng));
+  std::generate(w178.begin(), w178.end(), std::ref(f16rng));
+  std::generate(w179.begin(), w179.end(), std::ref(f16rng));
+  std::generate(w180.begin(), w180.end(), std::ref(f16rng));
+  std::generate(w181.begin(), w181.end(), std::ref(f16rng));
+  std::generate(w182.begin(), w182.end(), std::ref(f16rng));
+  std::generate(w183.begin(), w183.end(), std::ref(f16rng));
+  std::generate(w184.begin(), w184.end(), std::ref(f16rng));
+  std::generate(w185.begin(), w185.end(), std::ref(f16rng));
+  std::generate(w186.begin(), w186.end(), std::ref(f16rng));
+  std::generate(w187.begin(), w187.end(), std::ref(f16rng));
+  std::generate(w188.begin(), w188.end(), std::ref(f16rng));
+  std::generate(w189.begin(), w189.end(), std::ref(f16rng));
+  std::generate(w190.begin(), w190.end(), std::ref(f16rng));
+  std::generate(w191.begin(), w191.end(), std::ref(f16rng));
+  std::generate(w192.begin(), w192.end(), std::ref(f16rng));
+  std::generate(w193.begin(), w193.end(), std::ref(f16rng));
+  std::generate(w194.begin(), w194.end(), std::ref(f16rng));
+  std::generate(w195.begin(), w195.end(), std::ref(f16rng));
+  std::generate(w196.begin(), w196.end(), std::ref(f16rng));
+  std::generate(w197.begin(), w197.end(), std::ref(f16rng));
+  std::generate(w198.begin(), w198.end(), std::ref(f16rng));
+  std::generate(w199.begin(), w199.end(), std::ref(f16rng));
+  std::generate(w200.begin(), w200.end(), std::ref(f16rng));
+  std::generate(w201.begin(), w201.end(), std::ref(f16rng));
+  std::generate(w202.begin(), w202.end(), std::ref(f16rng));
+  std::generate(w203.begin(), w203.end(), std::ref(f16rng));
+  std::generate(w204.begin(), w204.end(), std::ref(f16rng));
+  std::generate(w205.begin(), w205.end(), std::ref(f16rng));
+  std::generate(w206.begin(), w206.end(), std::ref(f16rng));
+  std::generate(w207.begin(), w207.end(), std::ref(f16rng));
+  std::generate(w208.begin(), w208.end(), std::ref(f16rng));
+  std::generate(w209.begin(), w209.end(), std::ref(f16rng));
+  std::generate(w210.begin(), w210.end(), std::ref(f16rng));
+  std::generate(w211.begin(), w211.end(), std::ref(f16rng));
+  std::generate(w212.begin(), w212.end(), std::ref(f16rng));
+  std::generate(w213.begin(), w213.end(), std::ref(f16rng));
+  std::generate(w214.begin(), w214.end(), std::ref(f16rng));
+  std::generate(w215.begin(), w215.end(), std::ref(f16rng));
+  std::generate(w216.begin(), w216.end(), std::ref(f16rng));
+  std::generate(w217.begin(), w217.end(), std::ref(f16rng));
+  std::generate(w218.begin(), w218.end(), std::ref(f16rng));
+  std::generate(w219.begin(), w219.end(), std::ref(f16rng));
+  std::generate(w220.begin(), w220.end(), std::ref(f16rng));
+  std::generate(w221.begin(), w221.end(), std::ref(f16rng));
+  std::generate(w222.begin(), w222.end(), std::ref(f16rng));
+  std::generate(w223.begin(), w223.end(), std::ref(f16rng));
+  std::generate(w224.begin(), w224.end(), std::ref(f16rng));
+  std::generate(w225.begin(), w225.end(), std::ref(f16rng));
+  std::generate(w226.begin(), w226.end(), std::ref(f16rng));
+  std::generate(w227.begin(), w227.end(), std::ref(f16rng));
+  std::generate(w228.begin(), w228.end(), std::ref(f16rng));
+  std::generate(w229.begin(), w229.end(), std::ref(f16rng));
+  std::generate(w230.begin(), w230.end(), std::ref(f16rng));
+  std::generate(w231.begin(), w231.end(), std::ref(f16rng));
+  std::generate(w232.begin(), w232.end(), std::ref(f16rng));
+  std::generate(w233.begin(), w233.end(), std::ref(f16rng));
+  std::generate(w234.begin(), w234.end(), std::ref(f16rng));
+  std::generate(w235.begin(), w235.end(), std::ref(f16rng));
+  std::generate(w236.begin(), w236.end(), std::ref(f16rng));
+  std::generate(w237.begin(), w237.end(), std::ref(f16rng));
+  std::generate(w238.begin(), w238.end(), std::ref(f16rng));
+  std::generate(w239.begin(), w239.end(), std::ref(f16rng));
+  std::generate(w240.begin(), w240.end(), std::ref(f16rng));
+  std::generate(w241.begin(), w241.end(), std::ref(f16rng));
 
   ExecutionPlan operators;
   xnn_status status;
@@ -523,7 +524,7 @@
     16 /* output_channels_per_group */,
     3 /* input pixel stride */,
     16 /* output pixel stride */,
-    w114, w115,
+    w114.data(), w115.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op0);
@@ -558,7 +559,7 @@
     1 /* output_channels_per_group */,
     16 /* input pixel stride */,
     16 /* output pixel stride */,
-    w116, w117,
+    w116.data(), w117.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op2);
@@ -580,7 +581,7 @@
     16 /* output_channels_per_group */,
     16 /* input pixel stride */,
     16 /* output pixel stride */,
-    w118, w119,
+    w118.data(), w119.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op3);
@@ -613,7 +614,7 @@
     64 /* output_channels_per_group */,
     16 /* input pixel stride */,
     64 /* output pixel stride */,
-    w120, w121,
+    w120.data(), w121.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op5);
@@ -635,7 +636,7 @@
     1 /* output_channels_per_group */,
     64 /* input pixel stride */,
     64 /* output pixel stride */,
-    w122, w123,
+    w122.data(), w123.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op6);
@@ -657,7 +658,7 @@
     24 /* output_channels_per_group */,
     64 /* input pixel stride */,
     24 /* output pixel stride */,
-    w124, w125,
+    w124.data(), w125.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op7);
@@ -679,7 +680,7 @@
     72 /* output_channels_per_group */,
     24 /* input pixel stride */,
     72 /* output pixel stride */,
-    w126, w127,
+    w126.data(), w127.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op8);
@@ -701,7 +702,7 @@
     1 /* output_channels_per_group */,
     72 /* input pixel stride */,
     72 /* output pixel stride */,
-    w128, w129,
+    w128.data(), w129.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op9);
@@ -723,7 +724,7 @@
     24 /* output_channels_per_group */,
     72 /* input pixel stride */,
     24 /* output pixel stride */,
-    w130, w131,
+    w130.data(), w131.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op10);
@@ -756,7 +757,7 @@
     72 /* output_channels_per_group */,
     24 /* input pixel stride */,
     72 /* output pixel stride */,
-    w132, w133,
+    w132.data(), w133.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op12);
@@ -778,7 +779,7 @@
     1 /* output_channels_per_group */,
     72 /* input pixel stride */,
     72 /* output pixel stride */,
-    w134, w135,
+    w134.data(), w135.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op13);
@@ -812,7 +813,7 @@
     24 /* output_channels_per_group */,
     72 /* input pixel stride */,
     24 /* output pixel stride */,
-    w136, w137,
+    w136.data(), w137.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op15);
@@ -834,7 +835,7 @@
     72 /* output_channels_per_group */,
     24 /* input pixel stride */,
     72 /* output pixel stride */,
-    w138, w139,
+    w138.data(), w139.data(),
     0.0f /* output min */, +0x1.00014Fp+0 /* output max */,
     0 /* flags */,
     &op16);
@@ -867,7 +868,7 @@
     40 /* output_channels_per_group */,
     72 /* input pixel stride */,
     40 /* output pixel stride */,
-    w140, w141,
+    w140.data(), w141.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op18);
@@ -889,7 +890,7 @@
     120 /* output_channels_per_group */,
     40 /* input pixel stride */,
     120 /* output pixel stride */,
-    w142, w143,
+    w142.data(), w143.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op19);
@@ -911,7 +912,7 @@
     1 /* output_channels_per_group */,
     120 /* input pixel stride */,
     120 /* output pixel stride */,
-    w144, w145,
+    w144.data(), w145.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op20);
@@ -945,7 +946,7 @@
     32 /* output_channels_per_group */,
     120 /* input pixel stride */,
     32 /* output pixel stride */,
-    w146, w147,
+    w146.data(), w147.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op22);
@@ -967,7 +968,7 @@
     120 /* output_channels_per_group */,
     32 /* input pixel stride */,
     120 /* output pixel stride */,
-    w148, w149,
+    w148.data(), w149.data(),
     0.0f /* output min */, +0x1.00014Fp+0 /* output max */,
     0 /* flags */,
     &op23);
@@ -1000,7 +1001,7 @@
     40 /* output_channels_per_group */,
     120 /* input pixel stride */,
     40 /* output pixel stride */,
-    w150, w151,
+    w150.data(), w151.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op25);
@@ -1033,7 +1034,7 @@
     120 /* output_channels_per_group */,
     40 /* input pixel stride */,
     120 /* output pixel stride */,
-    w152, w153,
+    w152.data(), w153.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op27);
@@ -1055,7 +1056,7 @@
     1 /* output_channels_per_group */,
     120 /* input pixel stride */,
     120 /* output pixel stride */,
-    w154, w155,
+    w154.data(), w155.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op28);
@@ -1089,7 +1090,7 @@
     32 /* output_channels_per_group */,
     120 /* input pixel stride */,
     32 /* output pixel stride */,
-    w156, w157,
+    w156.data(), w157.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op30);
@@ -1111,7 +1112,7 @@
     120 /* output_channels_per_group */,
     32 /* input pixel stride */,
     120 /* output pixel stride */,
-    w158, w159,
+    w158.data(), w159.data(),
     0.0f /* output min */, +0x1.00014Fp+0 /* output max */,
     0 /* flags */,
     &op31);
@@ -1144,7 +1145,7 @@
     40 /* output_channels_per_group */,
     120 /* input pixel stride */,
     40 /* output pixel stride */,
-    w160, w161,
+    w160.data(), w161.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op33);
@@ -1177,7 +1178,7 @@
     240 /* output_channels_per_group */,
     40 /* input pixel stride */,
     240 /* output pixel stride */,
-    w162, w163,
+    w162.data(), w163.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op35);
@@ -1212,7 +1213,7 @@
     1 /* output_channels_per_group */,
     240 /* input pixel stride */,
     240 /* output pixel stride */,
-    w164, w165,
+    w164.data(), w165.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op37);
@@ -1247,7 +1248,7 @@
     80 /* output_channels_per_group */,
     240 /* input pixel stride */,
     80 /* output pixel stride */,
-    w166, w167,
+    w166.data(), w167.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op39);
@@ -1269,7 +1270,7 @@
     200 /* output_channels_per_group */,
     80 /* input pixel stride */,
     200 /* output pixel stride */,
-    w168, w169,
+    w168.data(), w169.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op40);
@@ -1304,7 +1305,7 @@
     1 /* output_channels_per_group */,
     200 /* input pixel stride */,
     200 /* output pixel stride */,
-    w170, w171,
+    w170.data(), w171.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op42);
@@ -1339,7 +1340,7 @@
     80 /* output_channels_per_group */,
     200 /* input pixel stride */,
     80 /* output pixel stride */,
-    w172, w173,
+    w172.data(), w173.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op44);
@@ -1372,7 +1373,7 @@
     184 /* output_channels_per_group */,
     80 /* input pixel stride */,
     184 /* output pixel stride */,
-    w174, w175,
+    w174.data(), w175.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op46);
@@ -1407,7 +1408,7 @@
     1 /* output_channels_per_group */,
     184 /* input pixel stride */,
     184 /* output pixel stride */,
-    w176, w177,
+    w176.data(), w177.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op48);
@@ -1442,7 +1443,7 @@
     80 /* output_channels_per_group */,
     184 /* input pixel stride */,
     80 /* output pixel stride */,
-    w178, w179,
+    w178.data(), w179.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op50);
@@ -1475,7 +1476,7 @@
     184 /* output_channels_per_group */,
     80 /* input pixel stride */,
     184 /* output pixel stride */,
-    w180, w181,
+    w180.data(), w181.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op52);
@@ -1510,7 +1511,7 @@
     1 /* output_channels_per_group */,
     184 /* input pixel stride */,
     184 /* output pixel stride */,
-    w182, w183,
+    w182.data(), w183.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op54);
@@ -1545,7 +1546,7 @@
     80 /* output_channels_per_group */,
     184 /* input pixel stride */,
     80 /* output pixel stride */,
-    w184, w185,
+    w184.data(), w185.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op56);
@@ -1578,7 +1579,7 @@
     480 /* output_channels_per_group */,
     80 /* input pixel stride */,
     480 /* output pixel stride */,
-    w186, w187,
+    w186.data(), w187.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op58);
@@ -1613,7 +1614,7 @@
     1 /* output_channels_per_group */,
     480 /* input pixel stride */,
     480 /* output pixel stride */,
-    w188, w189,
+    w188.data(), w189.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op60);
@@ -1660,7 +1661,7 @@
     120 /* output_channels_per_group */,
     480 /* input pixel stride */,
     120 /* output pixel stride */,
-    w190, w191,
+    w190.data(), w191.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op63);
@@ -1682,7 +1683,7 @@
     480 /* output_channels_per_group */,
     120 /* input pixel stride */,
     480 /* output pixel stride */,
-    w192, w193,
+    w192.data(), w193.data(),
     0.0f /* output min */, +0x1.00014Fp+0 /* output max */,
     0 /* flags */,
     &op64);
@@ -1715,7 +1716,7 @@
     112 /* output_channels_per_group */,
     480 /* input pixel stride */,
     112 /* output pixel stride */,
-    w194, w195,
+    w194.data(), w195.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op66);
@@ -1737,7 +1738,7 @@
     672 /* output_channels_per_group */,
     112 /* input pixel stride */,
     672 /* output pixel stride */,
-    w196, w197,
+    w196.data(), w197.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op67);
@@ -1772,7 +1773,7 @@
     1 /* output_channels_per_group */,
     672 /* input pixel stride */,
     672 /* output pixel stride */,
-    w198, w199,
+    w198.data(), w199.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op69);
@@ -1819,7 +1820,7 @@
     168 /* output_channels_per_group */,
     672 /* input pixel stride */,
     168 /* output pixel stride */,
-    w200, w201,
+    w200.data(), w201.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op72);
@@ -1841,7 +1842,7 @@
     672 /* output_channels_per_group */,
     168 /* input pixel stride */,
     672 /* output pixel stride */,
-    w202, w203,
+    w202.data(), w203.data(),
     0.0f /* output min */, +0x1.00014Fp+0 /* output max */,
     0 /* flags */,
     &op73);
@@ -1874,7 +1875,7 @@
     112 /* output_channels_per_group */,
     672 /* input pixel stride */,
     112 /* output pixel stride */,
-    w204, w205,
+    w204.data(), w205.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op75);
@@ -1907,7 +1908,7 @@
     672 /* output_channels_per_group */,
     112 /* input pixel stride */,
     672 /* output pixel stride */,
-    w206, w207,
+    w206.data(), w207.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op77);
@@ -1942,7 +1943,7 @@
     1 /* output_channels_per_group */,
     672 /* input pixel stride */,
     672 /* output pixel stride */,
-    w208, w209,
+    w208.data(), w209.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op79);
@@ -1989,7 +1990,7 @@
     168 /* output_channels_per_group */,
     672 /* input pixel stride */,
     168 /* output pixel stride */,
-    w210, w211,
+    w210.data(), w211.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op82);
@@ -2011,7 +2012,7 @@
     672 /* output_channels_per_group */,
     168 /* input pixel stride */,
     672 /* output pixel stride */,
-    w212, w213,
+    w212.data(), w213.data(),
     0.0f /* output min */, +0x1.00014Fp+0 /* output max */,
     0 /* flags */,
     &op83);
@@ -2044,7 +2045,7 @@
     160 /* output_channels_per_group */,
     672 /* input pixel stride */,
     160 /* output pixel stride */,
-    w214, w215,
+    w214.data(), w215.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op85);
@@ -2066,7 +2067,7 @@
     960 /* output_channels_per_group */,
     160 /* input pixel stride */,
     960 /* output pixel stride */,
-    w216, w217,
+    w216.data(), w217.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op86);
@@ -2101,7 +2102,7 @@
     1 /* output_channels_per_group */,
     960 /* input pixel stride */,
     960 /* output pixel stride */,
-    w218, w219,
+    w218.data(), w219.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op88);
@@ -2148,7 +2149,7 @@
     240 /* output_channels_per_group */,
     960 /* input pixel stride */,
     240 /* output pixel stride */,
-    w220, w221,
+    w220.data(), w221.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op91);
@@ -2170,7 +2171,7 @@
     960 /* output_channels_per_group */,
     240 /* input pixel stride */,
     960 /* output pixel stride */,
-    w222, w223,
+    w222.data(), w223.data(),
     0.0f /* output min */, +0x1.00014Fp+0 /* output max */,
     0 /* flags */,
     &op92);
@@ -2203,7 +2204,7 @@
     160 /* output_channels_per_group */,
     960 /* input pixel stride */,
     160 /* output pixel stride */,
-    w224, w225,
+    w224.data(), w225.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op94);
@@ -2236,7 +2237,7 @@
     960 /* output_channels_per_group */,
     160 /* input pixel stride */,
     960 /* output pixel stride */,
-    w226, w227,
+    w226.data(), w227.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op96);
@@ -2271,7 +2272,7 @@
     1 /* output_channels_per_group */,
     960 /* input pixel stride */,
     960 /* output pixel stride */,
-    w228, w229,
+    w228.data(), w229.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op98);
@@ -2318,7 +2319,7 @@
     240 /* output_channels_per_group */,
     960 /* input pixel stride */,
     240 /* output pixel stride */,
-    w230, w231,
+    w230.data(), w231.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op101);
@@ -2340,7 +2341,7 @@
     960 /* output_channels_per_group */,
     240 /* input pixel stride */,
     960 /* output pixel stride */,
-    w232, w233,
+    w232.data(), w233.data(),
     0.0f /* output min */, +0x1.00014Fp+0 /* output max */,
     0 /* flags */,
     &op102);
@@ -2373,7 +2374,7 @@
     160 /* output_channels_per_group */,
     960 /* input pixel stride */,
     160 /* output pixel stride */,
-    w234, w235,
+    w234.data(), w235.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op104);
@@ -2406,7 +2407,7 @@
     960 /* output_channels_per_group */,
     160 /* input pixel stride */,
     960 /* output pixel stride */,
-    w236, w237,
+    w236.data(), w237.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op106);
@@ -2453,7 +2454,7 @@
     1280 /* output_channels_per_group */,
     960 /* input pixel stride */,
     1280 /* output pixel stride */,
-    w238, w239,
+    w238.data(), w239.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op109);
@@ -2500,7 +2501,7 @@
     1001 /* output_channels_per_group */,
     1280 /* input pixel stride */,
     1001 /* output pixel stride */,
-    w240, w241,
+    w240.data(), w241.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op112);
@@ -2515,7 +2516,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op0,
     1 /* batch size */, 224 /* input height */, 224 /* input width */,
-    v0 /* input */, v1 /* output */,
+    v0.data() /* input */, v1.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #0" << std::endl;
@@ -2525,7 +2526,7 @@
   status = xnn_setup_hardswish_nc_f16(
     op1,
     12544 /* batch size */,
-    v1 /* input */, v2 /* output */,
+    v1.data() /* input */, v2.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #1" << std::endl;
@@ -2535,7 +2536,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op2,
     1 /* batch size */, 112 /* input height */, 112 /* input width */,
-    v2 /* input */, v3 /* output */,
+    v2.data() /* input */, v3.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #2" << std::endl;
@@ -2545,7 +2546,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op3,
     1 /* batch size */, 112 /* input height */, 112 /* input width */,
-    v3 /* input */, v4 /* output */,
+    v3.data() /* input */, v4.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #3" << std::endl;
@@ -2558,7 +2559,7 @@
     status = xnn_setup_add_nd_f16(
       op4,
       4, a_shape, 4, b_shape,
-      v4 /* a */, v2 /* b */, v5 /* output */,
+      v4.data() /* a */, v2.data() /* b */, v5.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -2569,7 +2570,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op5,
     1 /* batch size */, 112 /* input height */, 112 /* input width */,
-    v5 /* input */, v6 /* output */,
+    v5.data() /* input */, v6.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #5" << std::endl;
@@ -2579,7 +2580,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op6,
     1 /* batch size */, 112 /* input height */, 112 /* input width */,
-    v6 /* input */, v7 /* output */,
+    v6.data() /* input */, v7.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #6" << std::endl;
@@ -2589,7 +2590,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op7,
     1 /* batch size */, 56 /* input height */, 56 /* input width */,
-    v7 /* input */, v8 /* output */,
+    v7.data() /* input */, v8.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #7" << std::endl;
@@ -2599,7 +2600,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op8,
     1 /* batch size */, 56 /* input height */, 56 /* input width */,
-    v8 /* input */, v9 /* output */,
+    v8.data() /* input */, v9.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #8" << std::endl;
@@ -2609,7 +2610,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op9,
     1 /* batch size */, 56 /* input height */, 56 /* input width */,
-    v9 /* input */, v10 /* output */,
+    v9.data() /* input */, v10.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #9" << std::endl;
@@ -2619,7 +2620,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op10,
     1 /* batch size */, 56 /* input height */, 56 /* input width */,
-    v10 /* input */, v11 /* output */,
+    v10.data() /* input */, v11.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #10" << std::endl;
@@ -2632,7 +2633,7 @@
     status = xnn_setup_add_nd_f16(
       op11,
       4, a_shape, 4, b_shape,
-      v11 /* a */, v8 /* b */, v12 /* output */,
+      v11.data() /* a */, v8.data() /* b */, v12.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -2643,7 +2644,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op12,
     1 /* batch size */, 56 /* input height */, 56 /* input width */,
-    v12 /* input */, v13 /* output */,
+    v12.data() /* input */, v13.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #12" << std::endl;
@@ -2653,7 +2654,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op13,
     1 /* batch size */, 56 /* input height */, 56 /* input width */,
-    v13 /* input */, v14 /* output */,
+    v13.data() /* input */, v14.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #13" << std::endl;
@@ -2663,7 +2664,7 @@
   status = xnn_setup_global_average_pooling_nwc_f16(
     op14,
     1 /* batch size */, 784 /* width */,
-    v14 /* input */, v15 /* output */,
+    v14.data() /* input */, v15.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #14" << std::endl;
@@ -2673,7 +2674,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op15,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v15 /* input */, v16 /* output */,
+    v15.data() /* input */, v16.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #15" << std::endl;
@@ -2683,7 +2684,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op16,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v16 /* input */, v17 /* output */,
+    v16.data() /* input */, v17.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #16" << std::endl;
@@ -2696,7 +2697,7 @@
     status = xnn_setup_multiply_nd_f16(
       op17,
       4, a_shape, 4, b_shape,
-      v14 /* a */, v17 /* b */, v18 /* output */,
+      v14.data() /* a */, v17.data() /* b */, v18.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -2707,7 +2708,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op18,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v18 /* input */, v19 /* output */,
+    v18.data() /* input */, v19.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #18" << std::endl;
@@ -2717,7 +2718,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op19,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v19 /* input */, v20 /* output */,
+    v19.data() /* input */, v20.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #19" << std::endl;
@@ -2727,7 +2728,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op20,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v20 /* input */, v21 /* output */,
+    v20.data() /* input */, v21.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #20" << std::endl;
@@ -2737,7 +2738,7 @@
   status = xnn_setup_global_average_pooling_nwc_f16(
     op21,
     1 /* batch size */, 784 /* width */,
-    v21 /* input */, v22 /* output */,
+    v21.data() /* input */, v22.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #21" << std::endl;
@@ -2747,7 +2748,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op22,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v22 /* input */, v23 /* output */,
+    v22.data() /* input */, v23.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #22" << std::endl;
@@ -2757,7 +2758,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op23,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v23 /* input */, v24 /* output */,
+    v23.data() /* input */, v24.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #23" << std::endl;
@@ -2770,7 +2771,7 @@
     status = xnn_setup_multiply_nd_f16(
       op24,
       4, a_shape, 4, b_shape,
-      v21 /* a */, v24 /* b */, v25 /* output */,
+      v21.data() /* a */, v24.data() /* b */, v25.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -2781,7 +2782,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op25,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v25 /* input */, v26 /* output */,
+    v25.data() /* input */, v26.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #25" << std::endl;
@@ -2794,7 +2795,7 @@
     status = xnn_setup_add_nd_f16(
       op26,
       4, a_shape, 4, b_shape,
-      v26 /* a */, v19 /* b */, v27 /* output */,
+      v26.data() /* a */, v19.data() /* b */, v27.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -2805,7 +2806,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op27,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v27 /* input */, v28 /* output */,
+    v27.data() /* input */, v28.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #27" << std::endl;
@@ -2815,7 +2816,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op28,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v28 /* input */, v29 /* output */,
+    v28.data() /* input */, v29.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #28" << std::endl;
@@ -2825,7 +2826,7 @@
   status = xnn_setup_global_average_pooling_nwc_f16(
     op29,
     1 /* batch size */, 784 /* width */,
-    v29 /* input */, v30 /* output */,
+    v29.data() /* input */, v30.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #29" << std::endl;
@@ -2835,7 +2836,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op30,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v30 /* input */, v31 /* output */,
+    v30.data() /* input */, v31.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #30" << std::endl;
@@ -2845,7 +2846,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op31,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v31 /* input */, v32 /* output */,
+    v31.data() /* input */, v32.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #31" << std::endl;
@@ -2858,7 +2859,7 @@
     status = xnn_setup_multiply_nd_f16(
       op32,
       4, a_shape, 4, b_shape,
-      v29 /* a */, v32 /* b */, v33 /* output */,
+      v29.data() /* a */, v32.data() /* b */, v33.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -2869,7 +2870,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op33,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v33 /* input */, v34 /* output */,
+    v33.data() /* input */, v34.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #33" << std::endl;
@@ -2882,7 +2883,7 @@
     status = xnn_setup_add_nd_f16(
       op34,
       4, a_shape, 4, b_shape,
-      v34 /* a */, v27 /* b */, v35 /* output */,
+      v34.data() /* a */, v27.data() /* b */, v35.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -2893,7 +2894,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op35,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v35 /* input */, v36 /* output */,
+    v35.data() /* input */, v36.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #35" << std::endl;
@@ -2903,7 +2904,7 @@
   status = xnn_setup_hardswish_nc_f16(
     op36,
     784 /* batch size */,
-    v36 /* input */, v37 /* output */,
+    v36.data() /* input */, v37.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #36" << std::endl;
@@ -2913,7 +2914,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op37,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v37 /* input */, v38 /* output */,
+    v37.data() /* input */, v38.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #37" << std::endl;
@@ -2923,7 +2924,7 @@
   status = xnn_setup_hardswish_nc_f16(
     op38,
     196 /* batch size */,
-    v38 /* input */, v39 /* output */,
+    v38.data() /* input */, v39.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #38" << std::endl;
@@ -2933,7 +2934,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op39,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v39 /* input */, v40 /* output */,
+    v39.data() /* input */, v40.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #39" << std::endl;
@@ -2943,7 +2944,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op40,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v40 /* input */, v41 /* output */,
+    v40.data() /* input */, v41.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #40" << std::endl;
@@ -2953,7 +2954,7 @@
   status = xnn_setup_hardswish_nc_f16(
     op41,
     196 /* batch size */,
-    v41 /* input */, v42 /* output */,
+    v41.data() /* input */, v42.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #41" << std::endl;
@@ -2963,7 +2964,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op42,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v42 /* input */, v43 /* output */,
+    v42.data() /* input */, v43.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #42" << std::endl;
@@ -2973,7 +2974,7 @@
   status = xnn_setup_hardswish_nc_f16(
     op43,
     196 /* batch size */,
-    v43 /* input */, v44 /* output */,
+    v43.data() /* input */, v44.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #43" << std::endl;
@@ -2983,7 +2984,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op44,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v44 /* input */, v45 /* output */,
+    v44.data() /* input */, v45.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #44" << std::endl;
@@ -2996,7 +2997,7 @@
     status = xnn_setup_add_nd_f16(
       op45,
       4, a_shape, 4, b_shape,
-      v45 /* a */, v40 /* b */, v46 /* output */,
+      v45.data() /* a */, v40.data() /* b */, v46.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -3007,7 +3008,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op46,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v46 /* input */, v47 /* output */,
+    v46.data() /* input */, v47.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #46" << std::endl;
@@ -3017,7 +3018,7 @@
   status = xnn_setup_hardswish_nc_f16(
     op47,
     196 /* batch size */,
-    v47 /* input */, v48 /* output */,
+    v47.data() /* input */, v48.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #47" << std::endl;
@@ -3027,7 +3028,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op48,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v48 /* input */, v49 /* output */,
+    v48.data() /* input */, v49.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #48" << std::endl;
@@ -3037,7 +3038,7 @@
   status = xnn_setup_hardswish_nc_f16(
     op49,
     196 /* batch size */,
-    v49 /* input */, v50 /* output */,
+    v49.data() /* input */, v50.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #49" << std::endl;
@@ -3047,7 +3048,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op50,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v50 /* input */, v51 /* output */,
+    v50.data() /* input */, v51.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #50" << std::endl;
@@ -3060,7 +3061,7 @@
     status = xnn_setup_add_nd_f16(
       op51,
       4, a_shape, 4, b_shape,
-      v51 /* a */, v46 /* b */, v52 /* output */,
+      v51.data() /* a */, v46.data() /* b */, v52.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -3071,7 +3072,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op52,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v52 /* input */, v53 /* output */,
+    v52.data() /* input */, v53.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #52" << std::endl;
@@ -3081,7 +3082,7 @@
   status = xnn_setup_hardswish_nc_f16(
     op53,
     196 /* batch size */,
-    v53 /* input */, v54 /* output */,
+    v53.data() /* input */, v54.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #53" << std::endl;
@@ -3091,7 +3092,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op54,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v54 /* input */, v55 /* output */,
+    v54.data() /* input */, v55.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #54" << std::endl;
@@ -3101,7 +3102,7 @@
   status = xnn_setup_hardswish_nc_f16(
     op55,
     196 /* batch size */,
-    v55 /* input */, v56 /* output */,
+    v55.data() /* input */, v56.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #55" << std::endl;
@@ -3111,7 +3112,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op56,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v56 /* input */, v57 /* output */,
+    v56.data() /* input */, v57.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #56" << std::endl;
@@ -3124,7 +3125,7 @@
     status = xnn_setup_add_nd_f16(
       op57,
       4, a_shape, 4, b_shape,
-      v57 /* a */, v52 /* b */, v58 /* output */,
+      v57.data() /* a */, v52.data() /* b */, v58.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -3135,7 +3136,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op58,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v58 /* input */, v59 /* output */,
+    v58.data() /* input */, v59.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #58" << std::endl;
@@ -3145,7 +3146,7 @@
   status = xnn_setup_hardswish_nc_f16(
     op59,
     196 /* batch size */,
-    v59 /* input */, v60 /* output */,
+    v59.data() /* input */, v60.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #59" << std::endl;
@@ -3155,7 +3156,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op60,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v60 /* input */, v61 /* output */,
+    v60.data() /* input */, v61.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #60" << std::endl;
@@ -3165,7 +3166,7 @@
   status = xnn_setup_hardswish_nc_f16(
     op61,
     196 /* batch size */,
-    v61 /* input */, v62 /* output */,
+    v61.data() /* input */, v62.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #61" << std::endl;
@@ -3175,7 +3176,7 @@
   status = xnn_setup_global_average_pooling_nwc_f16(
     op62,
     1 /* batch size */, 196 /* width */,
-    v62 /* input */, v63 /* output */,
+    v62.data() /* input */, v63.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #62" << std::endl;
@@ -3185,7 +3186,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op63,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v63 /* input */, v64 /* output */,
+    v63.data() /* input */, v64.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #63" << std::endl;
@@ -3195,7 +3196,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op64,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v64 /* input */, v65 /* output */,
+    v64.data() /* input */, v65.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #64" << std::endl;
@@ -3208,7 +3209,7 @@
     status = xnn_setup_multiply_nd_f16(
       op65,
       4, a_shape, 4, b_shape,
-      v62 /* a */, v65 /* b */, v66 /* output */,
+      v62.data() /* a */, v65.data() /* b */, v66.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -3219,7 +3220,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op66,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v66 /* input */, v67 /* output */,
+    v66.data() /* input */, v67.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #66" << std::endl;
@@ -3229,7 +3230,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op67,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v67 /* input */, v68 /* output */,
+    v67.data() /* input */, v68.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #67" << std::endl;
@@ -3239,7 +3240,7 @@
   status = xnn_setup_hardswish_nc_f16(
     op68,
     196 /* batch size */,
-    v68 /* input */, v69 /* output */,
+    v68.data() /* input */, v69.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #68" << std::endl;
@@ -3249,7 +3250,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op69,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v69 /* input */, v70 /* output */,
+    v69.data() /* input */, v70.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #69" << std::endl;
@@ -3259,7 +3260,7 @@
   status = xnn_setup_hardswish_nc_f16(
     op70,
     196 /* batch size */,
-    v70 /* input */, v71 /* output */,
+    v70.data() /* input */, v71.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #70" << std::endl;
@@ -3269,7 +3270,7 @@
   status = xnn_setup_global_average_pooling_nwc_f16(
     op71,
     1 /* batch size */, 196 /* width */,
-    v71 /* input */, v72 /* output */,
+    v71.data() /* input */, v72.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #71" << std::endl;
@@ -3279,7 +3280,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op72,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v72 /* input */, v73 /* output */,
+    v72.data() /* input */, v73.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #72" << std::endl;
@@ -3289,7 +3290,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op73,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v73 /* input */, v74 /* output */,
+    v73.data() /* input */, v74.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #73" << std::endl;
@@ -3302,7 +3303,7 @@
     status = xnn_setup_multiply_nd_f16(
       op74,
       4, a_shape, 4, b_shape,
-      v71 /* a */, v74 /* b */, v75 /* output */,
+      v71.data() /* a */, v74.data() /* b */, v75.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -3313,7 +3314,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op75,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v75 /* input */, v76 /* output */,
+    v75.data() /* input */, v76.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #75" << std::endl;
@@ -3326,7 +3327,7 @@
     status = xnn_setup_add_nd_f16(
       op76,
       4, a_shape, 4, b_shape,
-      v76 /* a */, v67 /* b */, v77 /* output */,
+      v76.data() /* a */, v67.data() /* b */, v77.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -3337,7 +3338,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op77,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v77 /* input */, v78 /* output */,
+    v77.data() /* input */, v78.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #77" << std::endl;
@@ -3347,7 +3348,7 @@
   status = xnn_setup_hardswish_nc_f16(
     op78,
     196 /* batch size */,
-    v78 /* input */, v79 /* output */,
+    v78.data() /* input */, v79.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #78" << std::endl;
@@ -3357,7 +3358,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op79,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v79 /* input */, v80 /* output */,
+    v79.data() /* input */, v80.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #79" << std::endl;
@@ -3367,7 +3368,7 @@
   status = xnn_setup_hardswish_nc_f16(
     op80,
     49 /* batch size */,
-    v80 /* input */, v81 /* output */,
+    v80.data() /* input */, v81.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #80" << std::endl;
@@ -3377,7 +3378,7 @@
   status = xnn_setup_global_average_pooling_nwc_f16(
     op81,
     1 /* batch size */, 49 /* width */,
-    v81 /* input */, v82 /* output */,
+    v81.data() /* input */, v82.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #81" << std::endl;
@@ -3387,7 +3388,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op82,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v82 /* input */, v83 /* output */,
+    v82.data() /* input */, v83.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #82" << std::endl;
@@ -3397,7 +3398,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op83,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v83 /* input */, v84 /* output */,
+    v83.data() /* input */, v84.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #83" << std::endl;
@@ -3410,7 +3411,7 @@
     status = xnn_setup_multiply_nd_f16(
       op84,
       4, a_shape, 4, b_shape,
-      v81 /* a */, v84 /* b */, v85 /* output */,
+      v81.data() /* a */, v84.data() /* b */, v85.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -3421,7 +3422,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op85,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v85 /* input */, v86 /* output */,
+    v85.data() /* input */, v86.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #85" << std::endl;
@@ -3431,7 +3432,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op86,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v86 /* input */, v87 /* output */,
+    v86.data() /* input */, v87.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #86" << std::endl;
@@ -3441,7 +3442,7 @@
   status = xnn_setup_hardswish_nc_f16(
     op87,
     49 /* batch size */,
-    v87 /* input */, v88 /* output */,
+    v87.data() /* input */, v88.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #87" << std::endl;
@@ -3451,7 +3452,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op88,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v88 /* input */, v89 /* output */,
+    v88.data() /* input */, v89.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #88" << std::endl;
@@ -3461,7 +3462,7 @@
   status = xnn_setup_hardswish_nc_f16(
     op89,
     49 /* batch size */,
-    v89 /* input */, v90 /* output */,
+    v89.data() /* input */, v90.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #89" << std::endl;
@@ -3471,7 +3472,7 @@
   status = xnn_setup_global_average_pooling_nwc_f16(
     op90,
     1 /* batch size */, 49 /* width */,
-    v90 /* input */, v91 /* output */,
+    v90.data() /* input */, v91.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #90" << std::endl;
@@ -3481,7 +3482,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op91,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v91 /* input */, v92 /* output */,
+    v91.data() /* input */, v92.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #91" << std::endl;
@@ -3491,7 +3492,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op92,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v92 /* input */, v93 /* output */,
+    v92.data() /* input */, v93.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #92" << std::endl;
@@ -3504,7 +3505,7 @@
     status = xnn_setup_multiply_nd_f16(
       op93,
       4, a_shape, 4, b_shape,
-      v90 /* a */, v93 /* b */, v94 /* output */,
+      v90.data() /* a */, v93.data() /* b */, v94.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -3515,7 +3516,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op94,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v94 /* input */, v95 /* output */,
+    v94.data() /* input */, v95.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #94" << std::endl;
@@ -3528,7 +3529,7 @@
     status = xnn_setup_add_nd_f16(
       op95,
       4, a_shape, 4, b_shape,
-      v95 /* a */, v86 /* b */, v96 /* output */,
+      v95.data() /* a */, v86.data() /* b */, v96.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -3539,7 +3540,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op96,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v96 /* input */, v97 /* output */,
+    v96.data() /* input */, v97.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #96" << std::endl;
@@ -3549,7 +3550,7 @@
   status = xnn_setup_hardswish_nc_f16(
     op97,
     49 /* batch size */,
-    v97 /* input */, v98 /* output */,
+    v97.data() /* input */, v98.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #97" << std::endl;
@@ -3559,7 +3560,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op98,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v98 /* input */, v99 /* output */,
+    v98.data() /* input */, v99.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #98" << std::endl;
@@ -3569,7 +3570,7 @@
   status = xnn_setup_hardswish_nc_f16(
     op99,
     49 /* batch size */,
-    v99 /* input */, v100 /* output */,
+    v99.data() /* input */, v100.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #99" << std::endl;
@@ -3579,7 +3580,7 @@
   status = xnn_setup_global_average_pooling_nwc_f16(
     op100,
     1 /* batch size */, 49 /* width */,
-    v100 /* input */, v101 /* output */,
+    v100.data() /* input */, v101.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #100" << std::endl;
@@ -3589,7 +3590,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op101,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v101 /* input */, v102 /* output */,
+    v101.data() /* input */, v102.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #101" << std::endl;
@@ -3599,7 +3600,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op102,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v102 /* input */, v103 /* output */,
+    v102.data() /* input */, v103.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #102" << std::endl;
@@ -3612,7 +3613,7 @@
     status = xnn_setup_multiply_nd_f16(
       op103,
       4, a_shape, 4, b_shape,
-      v100 /* a */, v103 /* b */, v104 /* output */,
+      v100.data() /* a */, v103.data() /* b */, v104.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -3623,7 +3624,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op104,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v104 /* input */, v105 /* output */,
+    v104.data() /* input */, v105.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #104" << std::endl;
@@ -3636,7 +3637,7 @@
     status = xnn_setup_add_nd_f16(
       op105,
       4, a_shape, 4, b_shape,
-      v105 /* a */, v96 /* b */, v106 /* output */,
+      v105.data() /* a */, v96.data() /* b */, v106.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -3647,7 +3648,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op106,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v106 /* input */, v107 /* output */,
+    v106.data() /* input */, v107.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #106" << std::endl;
@@ -3657,7 +3658,7 @@
   status = xnn_setup_hardswish_nc_f16(
     op107,
     49 /* batch size */,
-    v107 /* input */, v108 /* output */,
+    v107.data() /* input */, v108.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #107" << std::endl;
@@ -3667,7 +3668,7 @@
   status = xnn_setup_global_average_pooling_nwc_f16(
     op108,
     1 /* batch size */, 49 /* width */,
-    v108 /* input */, v109 /* output */,
+    v108.data() /* input */, v109.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #108" << std::endl;
@@ -3677,7 +3678,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op109,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v109 /* input */, v110 /* output */,
+    v109.data() /* input */, v110.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #109" << std::endl;
@@ -3687,7 +3688,7 @@
   status = xnn_setup_hardswish_nc_f16(
     op110,
     1 /* batch size */,
-    v110 /* input */, v111 /* output */,
+    v110.data() /* input */, v111.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #110" << std::endl;
@@ -3697,7 +3698,7 @@
   status = xnn_setup_global_average_pooling_nwc_f16(
     op111,
     1 /* batch size */, 1 /* width */,
-    v111 /* input */, v112 /* output */,
+    v111.data() /* input */, v112.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #111" << std::endl;
@@ -3707,7 +3708,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op112,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v112 /* input */, v113 /* output */,
+    v112.data() /* input */, v113.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #112" << std::endl;
diff --git a/models/fp16-mobilenet-v3-small.cc b/models/fp16-mobilenet-v3-small.cc
index d78ae37..877673f 100644
--- a/models/fp16-mobilenet-v3-small.cc
+++ b/models/fp16-mobilenet-v3-small.cc
@@ -5,6 +5,7 @@
 
 #include <xnnpack.h>
 
+#include <array>
 #include <algorithm>
 #include <functional>
 #include <iostream>
@@ -18,427 +19,427 @@
 namespace models {
 
 ExecutionPlan FP16MobileNetV3Small(pthreadpool_t threadpool) {
-  alignas(16) static uint16_t v0[150528];
-  alignas(16) static uint16_t v1[200704];
-  alignas(16) static uint16_t v2[200704];
-  alignas(16) static uint16_t v3[50176];
-  alignas(16) static uint16_t v4[16];
-  alignas(16) static uint16_t v5[8];
-  alignas(16) static uint16_t v6[16];
-  alignas(16) static uint16_t v7[50176];
-  alignas(16) static uint16_t v8[50176];
-  alignas(16) static uint16_t v9[225792];
-  alignas(16) static uint16_t v10[56448];
-  alignas(16) static uint16_t v11[18816];
-  alignas(16) static uint16_t v12[68992];
-  alignas(16) static uint16_t v13[68992];
-  alignas(16) static uint16_t v14[18816];
-  alignas(16) static uint16_t v15[18816];
-  alignas(16) static uint16_t v16[75264];
-  alignas(16) static uint16_t v17[75264];
-  alignas(16) static uint16_t v18[18816];
-  alignas(16) static uint16_t v19[18816];
-  alignas(16) static uint16_t v20[96];
-  alignas(16) static uint16_t v21[24];
-  alignas(16) static uint16_t v22[96];
-  alignas(16) static uint16_t v23[18816];
-  alignas(16) static uint16_t v24[7840];
-  alignas(16) static uint16_t v25[47040];
-  alignas(16) static uint16_t v26[47040];
-  alignas(16) static uint16_t v27[47040];
-  alignas(16) static uint16_t v28[47040];
-  alignas(16) static uint16_t v29[240];
-  alignas(16) static uint16_t v30[64];
-  alignas(16) static uint16_t v31[240];
-  alignas(16) static uint16_t v32[47040];
-  alignas(16) static uint16_t v33[7840];
-  alignas(16) static uint16_t v34[7840];
-  alignas(16) static uint16_t v35[47040];
-  alignas(16) static uint16_t v36[47040];
-  alignas(16) static uint16_t v37[47040];
-  alignas(16) static uint16_t v38[47040];
-  alignas(16) static uint16_t v39[240];
-  alignas(16) static uint16_t v40[64];
-  alignas(16) static uint16_t v41[240];
-  alignas(16) static uint16_t v42[47040];
-  alignas(16) static uint16_t v43[7840];
-  alignas(16) static uint16_t v44[7840];
-  alignas(16) static uint16_t v45[23520];
-  alignas(16) static uint16_t v46[23520];
-  alignas(16) static uint16_t v47[23520];
-  alignas(16) static uint16_t v48[23520];
-  alignas(16) static uint16_t v49[120];
-  alignas(16) static uint16_t v50[32];
-  alignas(16) static uint16_t v51[120];
-  alignas(16) static uint16_t v52[23520];
-  alignas(16) static uint16_t v53[9408];
-  alignas(16) static uint16_t v54[28224];
-  alignas(16) static uint16_t v55[28224];
-  alignas(16) static uint16_t v56[28224];
-  alignas(16) static uint16_t v57[28224];
-  alignas(16) static uint16_t v58[144];
-  alignas(16) static uint16_t v59[40];
-  alignas(16) static uint16_t v60[144];
-  alignas(16) static uint16_t v61[28224];
-  alignas(16) static uint16_t v62[9408];
-  alignas(16) static uint16_t v63[9408];
-  alignas(16) static uint16_t v64[56448];
-  alignas(16) static uint16_t v65[56448];
-  alignas(16) static uint16_t v66[14112];
-  alignas(16) static uint16_t v67[14112];
-  alignas(16) static uint16_t v68[288];
-  alignas(16) static uint16_t v69[72];
-  alignas(16) static uint16_t v70[288];
-  alignas(16) static uint16_t v71[14112];
-  alignas(16) static uint16_t v72[4704];
-  alignas(16) static uint16_t v73[28224];
-  alignas(16) static uint16_t v74[28224];
-  alignas(16) static uint16_t v75[28224];
-  alignas(16) static uint16_t v76[28224];
-  alignas(16) static uint16_t v77[576];
-  alignas(16) static uint16_t v78[144];
-  alignas(16) static uint16_t v79[576];
-  alignas(16) static uint16_t v80[28224];
-  alignas(16) static uint16_t v81[4704];
-  alignas(16) static uint16_t v82[4704];
-  alignas(16) static uint16_t v83[28224];
-  alignas(16) static uint16_t v84[28224];
-  alignas(16) static uint16_t v85[28224];
-  alignas(16) static uint16_t v86[28224];
-  alignas(16) static uint16_t v87[576];
-  alignas(16) static uint16_t v88[144];
-  alignas(16) static uint16_t v89[576];
-  alignas(16) static uint16_t v90[28224];
-  alignas(16) static uint16_t v91[4704];
-  alignas(16) static uint16_t v92[4704];
-  alignas(16) static uint16_t v93[28224];
-  alignas(16) static uint16_t v94[28224];
-  alignas(16) static uint16_t v95[576];
-  alignas(16) static uint16_t v96[1024];
-  alignas(16) static uint16_t v97[1024];
-  alignas(16) static uint16_t v98[1024];
-  alignas(16) static uint16_t v99[1001];
-  alignas(16) static uint16_t w100[432];
-  alignas(16) static uint16_t w101[16];
-  alignas(16) static uint16_t w102[144];
-  alignas(16) static uint16_t w103[16];
-  alignas(16) static uint16_t w104[128];
-  alignas(16) static uint16_t w105[8];
-  alignas(16) static uint16_t w106[128];
-  alignas(16) static uint16_t w107[16];
-  alignas(16) static uint16_t w108[256];
-  alignas(16) static uint16_t w109[16];
-  alignas(16) static uint16_t w110[1152];
-  alignas(16) static uint16_t w111[72];
-  alignas(16) static uint16_t w112[648];
-  alignas(16) static uint16_t w113[72];
-  alignas(16) static uint16_t w114[1728];
-  alignas(16) static uint16_t w115[24];
-  alignas(16) static uint16_t w116[2112];
-  alignas(16) static uint16_t w117[88];
-  alignas(16) static uint16_t w118[792];
-  alignas(16) static uint16_t w119[88];
-  alignas(16) static uint16_t w120[2112];
-  alignas(16) static uint16_t w121[24];
-  alignas(16) static uint16_t w122[2304];
-  alignas(16) static uint16_t w123[96];
-  alignas(16) static uint16_t w124[2400];
-  alignas(16) static uint16_t w125[96];
-  alignas(16) static uint16_t w126[2304];
-  alignas(16) static uint16_t w127[24];
-  alignas(16) static uint16_t w128[2304];
-  alignas(16) static uint16_t w129[96];
-  alignas(16) static uint16_t w130[3840];
-  alignas(16) static uint16_t w131[40];
-  alignas(16) static uint16_t w132[9600];
-  alignas(16) static uint16_t w133[240];
-  alignas(16) static uint16_t w134[6000];
-  alignas(16) static uint16_t w135[240];
-  alignas(16) static uint16_t w136[15360];
-  alignas(16) static uint16_t w137[64];
-  alignas(16) static uint16_t w138[15360];
-  alignas(16) static uint16_t w139[240];
-  alignas(16) static uint16_t w140[9600];
-  alignas(16) static uint16_t w141[40];
-  alignas(16) static uint16_t w142[9600];
-  alignas(16) static uint16_t w143[240];
-  alignas(16) static uint16_t w144[6000];
-  alignas(16) static uint16_t w145[240];
-  alignas(16) static uint16_t w146[15360];
-  alignas(16) static uint16_t w147[64];
-  alignas(16) static uint16_t w148[15360];
-  alignas(16) static uint16_t w149[240];
-  alignas(16) static uint16_t w150[9600];
-  alignas(16) static uint16_t w151[40];
-  alignas(16) static uint16_t w152[4800];
-  alignas(16) static uint16_t w153[120];
-  alignas(16) static uint16_t w154[3000];
-  alignas(16) static uint16_t w155[120];
-  alignas(16) static uint16_t w156[3840];
-  alignas(16) static uint16_t w157[32];
-  alignas(16) static uint16_t w158[3840];
-  alignas(16) static uint16_t w159[120];
-  alignas(16) static uint16_t w160[5760];
-  alignas(16) static uint16_t w161[48];
-  alignas(16) static uint16_t w162[6912];
-  alignas(16) static uint16_t w163[144];
-  alignas(16) static uint16_t w164[3600];
-  alignas(16) static uint16_t w165[144];
-  alignas(16) static uint16_t w166[5760];
-  alignas(16) static uint16_t w167[40];
-  alignas(16) static uint16_t w168[5760];
-  alignas(16) static uint16_t w169[144];
-  alignas(16) static uint16_t w170[6912];
-  alignas(16) static uint16_t w171[48];
-  alignas(16) static uint16_t w172[13824];
-  alignas(16) static uint16_t w173[288];
-  alignas(16) static uint16_t w174[7200];
-  alignas(16) static uint16_t w175[288];
-  alignas(16) static uint16_t w176[20736];
-  alignas(16) static uint16_t w177[72];
-  alignas(16) static uint16_t w178[20736];
-  alignas(16) static uint16_t w179[288];
-  alignas(16) static uint16_t w180[27648];
-  alignas(16) static uint16_t w181[96];
-  alignas(16) static uint16_t w182[55296];
-  alignas(16) static uint16_t w183[576];
-  alignas(16) static uint16_t w184[14400];
-  alignas(16) static uint16_t w185[576];
-  alignas(16) static uint16_t w186[82944];
-  alignas(16) static uint16_t w187[144];
-  alignas(16) static uint16_t w188[82944];
-  alignas(16) static uint16_t w189[576];
-  alignas(16) static uint16_t w190[55296];
-  alignas(16) static uint16_t w191[96];
-  alignas(16) static uint16_t w192[55296];
-  alignas(16) static uint16_t w193[576];
-  alignas(16) static uint16_t w194[14400];
-  alignas(16) static uint16_t w195[576];
-  alignas(16) static uint16_t w196[82944];
-  alignas(16) static uint16_t w197[144];
-  alignas(16) static uint16_t w198[82944];
-  alignas(16) static uint16_t w199[576];
-  alignas(16) static uint16_t w200[55296];
-  alignas(16) static uint16_t w201[96];
-  alignas(16) static uint16_t w202[55296];
-  alignas(16) static uint16_t w203[576];
-  alignas(16) static uint16_t w204[589824];
-  alignas(16) static uint16_t w205[1024];
-  alignas(16) static uint16_t w206[1025024];
-  alignas(16) static uint16_t w207[1001];
+  alignas(16) static std::array<uint16_t, 150528> v0;
+  alignas(16) static std::array<uint16_t, 200704> v1;
+  alignas(16) static std::array<uint16_t, 200704> v2;
+  alignas(16) static std::array<uint16_t, 50176> v3;
+  alignas(16) static std::array<uint16_t, 16> v4;
+  alignas(16) static std::array<uint16_t, 8> v5;
+  alignas(16) static std::array<uint16_t, 16> v6;
+  alignas(16) static std::array<uint16_t, 50176> v7;
+  alignas(16) static std::array<uint16_t, 50176> v8;
+  alignas(16) static std::array<uint16_t, 225792> v9;
+  alignas(16) static std::array<uint16_t, 56448> v10;
+  alignas(16) static std::array<uint16_t, 18816> v11;
+  alignas(16) static std::array<uint16_t, 68992> v12;
+  alignas(16) static std::array<uint16_t, 68992> v13;
+  alignas(16) static std::array<uint16_t, 18816> v14;
+  alignas(16) static std::array<uint16_t, 18816> v15;
+  alignas(16) static std::array<uint16_t, 75264> v16;
+  alignas(16) static std::array<uint16_t, 75264> v17;
+  alignas(16) static std::array<uint16_t, 18816> v18;
+  alignas(16) static std::array<uint16_t, 18816> v19;
+  alignas(16) static std::array<uint16_t, 96> v20;
+  alignas(16) static std::array<uint16_t, 24> v21;
+  alignas(16) static std::array<uint16_t, 96> v22;
+  alignas(16) static std::array<uint16_t, 18816> v23;
+  alignas(16) static std::array<uint16_t, 7840> v24;
+  alignas(16) static std::array<uint16_t, 47040> v25;
+  alignas(16) static std::array<uint16_t, 47040> v26;
+  alignas(16) static std::array<uint16_t, 47040> v27;
+  alignas(16) static std::array<uint16_t, 47040> v28;
+  alignas(16) static std::array<uint16_t, 240> v29;
+  alignas(16) static std::array<uint16_t, 64> v30;
+  alignas(16) static std::array<uint16_t, 240> v31;
+  alignas(16) static std::array<uint16_t, 47040> v32;
+  alignas(16) static std::array<uint16_t, 7840> v33;
+  alignas(16) static std::array<uint16_t, 7840> v34;
+  alignas(16) static std::array<uint16_t, 47040> v35;
+  alignas(16) static std::array<uint16_t, 47040> v36;
+  alignas(16) static std::array<uint16_t, 47040> v37;
+  alignas(16) static std::array<uint16_t, 47040> v38;
+  alignas(16) static std::array<uint16_t, 240> v39;
+  alignas(16) static std::array<uint16_t, 64> v40;
+  alignas(16) static std::array<uint16_t, 240> v41;
+  alignas(16) static std::array<uint16_t, 47040> v42;
+  alignas(16) static std::array<uint16_t, 7840> v43;
+  alignas(16) static std::array<uint16_t, 7840> v44;
+  alignas(16) static std::array<uint16_t, 23520> v45;
+  alignas(16) static std::array<uint16_t, 23520> v46;
+  alignas(16) static std::array<uint16_t, 23520> v47;
+  alignas(16) static std::array<uint16_t, 23520> v48;
+  alignas(16) static std::array<uint16_t, 120> v49;
+  alignas(16) static std::array<uint16_t, 32> v50;
+  alignas(16) static std::array<uint16_t, 120> v51;
+  alignas(16) static std::array<uint16_t, 23520> v52;
+  alignas(16) static std::array<uint16_t, 9408> v53;
+  alignas(16) static std::array<uint16_t, 28224> v54;
+  alignas(16) static std::array<uint16_t, 28224> v55;
+  alignas(16) static std::array<uint16_t, 28224> v56;
+  alignas(16) static std::array<uint16_t, 28224> v57;
+  alignas(16) static std::array<uint16_t, 144> v58;
+  alignas(16) static std::array<uint16_t, 40> v59;
+  alignas(16) static std::array<uint16_t, 144> v60;
+  alignas(16) static std::array<uint16_t, 28224> v61;
+  alignas(16) static std::array<uint16_t, 9408> v62;
+  alignas(16) static std::array<uint16_t, 9408> v63;
+  alignas(16) static std::array<uint16_t, 56448> v64;
+  alignas(16) static std::array<uint16_t, 56448> v65;
+  alignas(16) static std::array<uint16_t, 14112> v66;
+  alignas(16) static std::array<uint16_t, 14112> v67;
+  alignas(16) static std::array<uint16_t, 288> v68;
+  alignas(16) static std::array<uint16_t, 72> v69;
+  alignas(16) static std::array<uint16_t, 288> v70;
+  alignas(16) static std::array<uint16_t, 14112> v71;
+  alignas(16) static std::array<uint16_t, 4704> v72;
+  alignas(16) static std::array<uint16_t, 28224> v73;
+  alignas(16) static std::array<uint16_t, 28224> v74;
+  alignas(16) static std::array<uint16_t, 28224> v75;
+  alignas(16) static std::array<uint16_t, 28224> v76;
+  alignas(16) static std::array<uint16_t, 576> v77;
+  alignas(16) static std::array<uint16_t, 144> v78;
+  alignas(16) static std::array<uint16_t, 576> v79;
+  alignas(16) static std::array<uint16_t, 28224> v80;
+  alignas(16) static std::array<uint16_t, 4704> v81;
+  alignas(16) static std::array<uint16_t, 4704> v82;
+  alignas(16) static std::array<uint16_t, 28224> v83;
+  alignas(16) static std::array<uint16_t, 28224> v84;
+  alignas(16) static std::array<uint16_t, 28224> v85;
+  alignas(16) static std::array<uint16_t, 28224> v86;
+  alignas(16) static std::array<uint16_t, 576> v87;
+  alignas(16) static std::array<uint16_t, 144> v88;
+  alignas(16) static std::array<uint16_t, 576> v89;
+  alignas(16) static std::array<uint16_t, 28224> v90;
+  alignas(16) static std::array<uint16_t, 4704> v91;
+  alignas(16) static std::array<uint16_t, 4704> v92;
+  alignas(16) static std::array<uint16_t, 28224> v93;
+  alignas(16) static std::array<uint16_t, 28224> v94;
+  alignas(16) static std::array<uint16_t, 576> v95;
+  alignas(16) static std::array<uint16_t, 1024> v96;
+  alignas(16) static std::array<uint16_t, 1024> v97;
+  alignas(16) static std::array<uint16_t, 1024> v98;
+  alignas(16) static std::array<uint16_t, 1001> v99;
+  alignas(16) static std::array<uint16_t, 432> w100;
+  alignas(16) static std::array<uint16_t, 16> w101;
+  alignas(16) static std::array<uint16_t, 144> w102;
+  alignas(16) static std::array<uint16_t, 16> w103;
+  alignas(16) static std::array<uint16_t, 128> w104;
+  alignas(16) static std::array<uint16_t, 8> w105;
+  alignas(16) static std::array<uint16_t, 128> w106;
+  alignas(16) static std::array<uint16_t, 16> w107;
+  alignas(16) static std::array<uint16_t, 256> w108;
+  alignas(16) static std::array<uint16_t, 16> w109;
+  alignas(16) static std::array<uint16_t, 1152> w110;
+  alignas(16) static std::array<uint16_t, 72> w111;
+  alignas(16) static std::array<uint16_t, 648> w112;
+  alignas(16) static std::array<uint16_t, 72> w113;
+  alignas(16) static std::array<uint16_t, 1728> w114;
+  alignas(16) static std::array<uint16_t, 24> w115;
+  alignas(16) static std::array<uint16_t, 2112> w116;
+  alignas(16) static std::array<uint16_t, 88> w117;
+  alignas(16) static std::array<uint16_t, 792> w118;
+  alignas(16) static std::array<uint16_t, 88> w119;
+  alignas(16) static std::array<uint16_t, 2112> w120;
+  alignas(16) static std::array<uint16_t, 24> w121;
+  alignas(16) static std::array<uint16_t, 2304> w122;
+  alignas(16) static std::array<uint16_t, 96> w123;
+  alignas(16) static std::array<uint16_t, 2400> w124;
+  alignas(16) static std::array<uint16_t, 96> w125;
+  alignas(16) static std::array<uint16_t, 2304> w126;
+  alignas(16) static std::array<uint16_t, 24> w127;
+  alignas(16) static std::array<uint16_t, 2304> w128;
+  alignas(16) static std::array<uint16_t, 96> w129;
+  alignas(16) static std::array<uint16_t, 3840> w130;
+  alignas(16) static std::array<uint16_t, 40> w131;
+  alignas(16) static std::array<uint16_t, 9600> w132;
+  alignas(16) static std::array<uint16_t, 240> w133;
+  alignas(16) static std::array<uint16_t, 6000> w134;
+  alignas(16) static std::array<uint16_t, 240> w135;
+  alignas(16) static std::array<uint16_t, 15360> w136;
+  alignas(16) static std::array<uint16_t, 64> w137;
+  alignas(16) static std::array<uint16_t, 15360> w138;
+  alignas(16) static std::array<uint16_t, 240> w139;
+  alignas(16) static std::array<uint16_t, 9600> w140;
+  alignas(16) static std::array<uint16_t, 40> w141;
+  alignas(16) static std::array<uint16_t, 9600> w142;
+  alignas(16) static std::array<uint16_t, 240> w143;
+  alignas(16) static std::array<uint16_t, 6000> w144;
+  alignas(16) static std::array<uint16_t, 240> w145;
+  alignas(16) static std::array<uint16_t, 15360> w146;
+  alignas(16) static std::array<uint16_t, 64> w147;
+  alignas(16) static std::array<uint16_t, 15360> w148;
+  alignas(16) static std::array<uint16_t, 240> w149;
+  alignas(16) static std::array<uint16_t, 9600> w150;
+  alignas(16) static std::array<uint16_t, 40> w151;
+  alignas(16) static std::array<uint16_t, 4800> w152;
+  alignas(16) static std::array<uint16_t, 120> w153;
+  alignas(16) static std::array<uint16_t, 3000> w154;
+  alignas(16) static std::array<uint16_t, 120> w155;
+  alignas(16) static std::array<uint16_t, 3840> w156;
+  alignas(16) static std::array<uint16_t, 32> w157;
+  alignas(16) static std::array<uint16_t, 3840> w158;
+  alignas(16) static std::array<uint16_t, 120> w159;
+  alignas(16) static std::array<uint16_t, 5760> w160;
+  alignas(16) static std::array<uint16_t, 48> w161;
+  alignas(16) static std::array<uint16_t, 6912> w162;
+  alignas(16) static std::array<uint16_t, 144> w163;
+  alignas(16) static std::array<uint16_t, 3600> w164;
+  alignas(16) static std::array<uint16_t, 144> w165;
+  alignas(16) static std::array<uint16_t, 5760> w166;
+  alignas(16) static std::array<uint16_t, 40> w167;
+  alignas(16) static std::array<uint16_t, 5760> w168;
+  alignas(16) static std::array<uint16_t, 144> w169;
+  alignas(16) static std::array<uint16_t, 6912> w170;
+  alignas(16) static std::array<uint16_t, 48> w171;
+  alignas(16) static std::array<uint16_t, 13824> w172;
+  alignas(16) static std::array<uint16_t, 288> w173;
+  alignas(16) static std::array<uint16_t, 7200> w174;
+  alignas(16) static std::array<uint16_t, 288> w175;
+  alignas(16) static std::array<uint16_t, 20736> w176;
+  alignas(16) static std::array<uint16_t, 72> w177;
+  alignas(16) static std::array<uint16_t, 20736> w178;
+  alignas(16) static std::array<uint16_t, 288> w179;
+  alignas(16) static std::array<uint16_t, 27648> w180;
+  alignas(16) static std::array<uint16_t, 96> w181;
+  alignas(16) static std::array<uint16_t, 55296> w182;
+  alignas(16) static std::array<uint16_t, 576> w183;
+  alignas(16) static std::array<uint16_t, 14400> w184;
+  alignas(16) static std::array<uint16_t, 576> w185;
+  alignas(16) static std::array<uint16_t, 82944> w186;
+  alignas(16) static std::array<uint16_t, 144> w187;
+  alignas(16) static std::array<uint16_t, 82944> w188;
+  alignas(16) static std::array<uint16_t, 576> w189;
+  alignas(16) static std::array<uint16_t, 55296> w190;
+  alignas(16) static std::array<uint16_t, 96> w191;
+  alignas(16) static std::array<uint16_t, 55296> w192;
+  alignas(16) static std::array<uint16_t, 576> w193;
+  alignas(16) static std::array<uint16_t, 14400> w194;
+  alignas(16) static std::array<uint16_t, 576> w195;
+  alignas(16) static std::array<uint16_t, 82944> w196;
+  alignas(16) static std::array<uint16_t, 144> w197;
+  alignas(16) static std::array<uint16_t, 82944> w198;
+  alignas(16) static std::array<uint16_t, 576> w199;
+  alignas(16) static std::array<uint16_t, 55296> w200;
+  alignas(16) static std::array<uint16_t, 96> w201;
+  alignas(16) static std::array<uint16_t, 55296> w202;
+  alignas(16) static std::array<uint16_t, 576> w203;
+  alignas(16) static std::array<uint16_t, 589824> w204;
+  alignas(16) static std::array<uint16_t, 1024> w205;
+  alignas(16) static std::array<uint16_t, 1025024> w206;
+  alignas(16) static std::array<uint16_t, 1001> w207;
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.0f, +1.0f), std::ref(rng));
   auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
-  std::generate(v0, v0 + 150528, std::ref(f16rng));
-  std::generate(v1, v1 + 200704, std::ref(f16rng));
-  std::generate(v2, v2 + 200704, std::ref(f16rng));
-  std::generate(v3, v3 + 50176, std::ref(f16rng));
-  std::generate(v4, v4 + 16, std::ref(f16rng));
-  std::generate(v5, v5 + 8, std::ref(f16rng));
-  std::generate(v6, v6 + 16, std::ref(f16rng));
-  std::generate(v7, v7 + 50176, std::ref(f16rng));
-  std::generate(v8, v8 + 50176, std::ref(f16rng));
-  std::generate(v9, v9 + 225792, std::ref(f16rng));
-  std::generate(v10, v10 + 56448, std::ref(f16rng));
-  std::generate(v11, v11 + 18816, std::ref(f16rng));
-  std::generate(v12, v12 + 68992, std::ref(f16rng));
-  std::generate(v13, v13 + 68992, std::ref(f16rng));
-  std::generate(v14, v14 + 18816, std::ref(f16rng));
-  std::generate(v15, v15 + 18816, std::ref(f16rng));
-  std::generate(v16, v16 + 75264, std::ref(f16rng));
-  std::generate(v17, v17 + 75264, std::ref(f16rng));
-  std::generate(v18, v18 + 18816, std::ref(f16rng));
-  std::generate(v19, v19 + 18816, std::ref(f16rng));
-  std::generate(v20, v20 + 96, std::ref(f16rng));
-  std::generate(v21, v21 + 24, std::ref(f16rng));
-  std::generate(v22, v22 + 96, std::ref(f16rng));
-  std::generate(v23, v23 + 18816, std::ref(f16rng));
-  std::generate(v24, v24 + 7840, std::ref(f16rng));
-  std::generate(v25, v25 + 47040, std::ref(f16rng));
-  std::generate(v26, v26 + 47040, std::ref(f16rng));
-  std::generate(v27, v27 + 47040, std::ref(f16rng));
-  std::generate(v28, v28 + 47040, std::ref(f16rng));
-  std::generate(v29, v29 + 240, std::ref(f16rng));
-  std::generate(v30, v30 + 64, std::ref(f16rng));
-  std::generate(v31, v31 + 240, std::ref(f16rng));
-  std::generate(v32, v32 + 47040, std::ref(f16rng));
-  std::generate(v33, v33 + 7840, std::ref(f16rng));
-  std::generate(v34, v34 + 7840, std::ref(f16rng));
-  std::generate(v35, v35 + 47040, std::ref(f16rng));
-  std::generate(v36, v36 + 47040, std::ref(f16rng));
-  std::generate(v37, v37 + 47040, std::ref(f16rng));
-  std::generate(v38, v38 + 47040, std::ref(f16rng));
-  std::generate(v39, v39 + 240, std::ref(f16rng));
-  std::generate(v40, v40 + 64, std::ref(f16rng));
-  std::generate(v41, v41 + 240, std::ref(f16rng));
-  std::generate(v42, v42 + 47040, std::ref(f16rng));
-  std::generate(v43, v43 + 7840, std::ref(f16rng));
-  std::generate(v44, v44 + 7840, std::ref(f16rng));
-  std::generate(v45, v45 + 23520, std::ref(f16rng));
-  std::generate(v46, v46 + 23520, std::ref(f16rng));
-  std::generate(v47, v47 + 23520, std::ref(f16rng));
-  std::generate(v48, v48 + 23520, std::ref(f16rng));
-  std::generate(v49, v49 + 120, std::ref(f16rng));
-  std::generate(v50, v50 + 32, std::ref(f16rng));
-  std::generate(v51, v51 + 120, std::ref(f16rng));
-  std::generate(v52, v52 + 23520, std::ref(f16rng));
-  std::generate(v53, v53 + 9408, std::ref(f16rng));
-  std::generate(v54, v54 + 28224, std::ref(f16rng));
-  std::generate(v55, v55 + 28224, std::ref(f16rng));
-  std::generate(v56, v56 + 28224, std::ref(f16rng));
-  std::generate(v57, v57 + 28224, std::ref(f16rng));
-  std::generate(v58, v58 + 144, std::ref(f16rng));
-  std::generate(v59, v59 + 40, std::ref(f16rng));
-  std::generate(v60, v60 + 144, std::ref(f16rng));
-  std::generate(v61, v61 + 28224, std::ref(f16rng));
-  std::generate(v62, v62 + 9408, std::ref(f16rng));
-  std::generate(v63, v63 + 9408, std::ref(f16rng));
-  std::generate(v64, v64 + 56448, std::ref(f16rng));
-  std::generate(v65, v65 + 56448, std::ref(f16rng));
-  std::generate(v66, v66 + 14112, std::ref(f16rng));
-  std::generate(v67, v67 + 14112, std::ref(f16rng));
-  std::generate(v68, v68 + 288, std::ref(f16rng));
-  std::generate(v69, v69 + 72, std::ref(f16rng));
-  std::generate(v70, v70 + 288, std::ref(f16rng));
-  std::generate(v71, v71 + 14112, std::ref(f16rng));
-  std::generate(v72, v72 + 4704, std::ref(f16rng));
-  std::generate(v73, v73 + 28224, std::ref(f16rng));
-  std::generate(v74, v74 + 28224, std::ref(f16rng));
-  std::generate(v75, v75 + 28224, std::ref(f16rng));
-  std::generate(v76, v76 + 28224, std::ref(f16rng));
-  std::generate(v77, v77 + 576, std::ref(f16rng));
-  std::generate(v78, v78 + 144, std::ref(f16rng));
-  std::generate(v79, v79 + 576, std::ref(f16rng));
-  std::generate(v80, v80 + 28224, std::ref(f16rng));
-  std::generate(v81, v81 + 4704, std::ref(f16rng));
-  std::generate(v82, v82 + 4704, std::ref(f16rng));
-  std::generate(v83, v83 + 28224, std::ref(f16rng));
-  std::generate(v84, v84 + 28224, std::ref(f16rng));
-  std::generate(v85, v85 + 28224, std::ref(f16rng));
-  std::generate(v86, v86 + 28224, std::ref(f16rng));
-  std::generate(v87, v87 + 576, std::ref(f16rng));
-  std::generate(v88, v88 + 144, std::ref(f16rng));
-  std::generate(v89, v89 + 576, std::ref(f16rng));
-  std::generate(v90, v90 + 28224, std::ref(f16rng));
-  std::generate(v91, v91 + 4704, std::ref(f16rng));
-  std::generate(v92, v92 + 4704, std::ref(f16rng));
-  std::generate(v93, v93 + 28224, std::ref(f16rng));
-  std::generate(v94, v94 + 28224, std::ref(f16rng));
-  std::generate(v95, v95 + 576, std::ref(f16rng));
-  std::generate(v96, v96 + 1024, std::ref(f16rng));
-  std::generate(v97, v97 + 1024, std::ref(f16rng));
-  std::generate(v98, v98 + 1024, std::ref(f16rng));
-  std::generate(v99, v99 + 1001, std::ref(f16rng));
-  std::generate(w100, w100 + 432, std::ref(f16rng));
-  std::generate(w101, w101 + 16, std::ref(f16rng));
-  std::generate(w102, w102 + 144, std::ref(f16rng));
-  std::generate(w103, w103 + 16, std::ref(f16rng));
-  std::generate(w104, w104 + 128, std::ref(f16rng));
-  std::generate(w105, w105 + 8, std::ref(f16rng));
-  std::generate(w106, w106 + 128, std::ref(f16rng));
-  std::generate(w107, w107 + 16, std::ref(f16rng));
-  std::generate(w108, w108 + 256, std::ref(f16rng));
-  std::generate(w109, w109 + 16, std::ref(f16rng));
-  std::generate(w110, w110 + 1152, std::ref(f16rng));
-  std::generate(w111, w111 + 72, std::ref(f16rng));
-  std::generate(w112, w112 + 648, std::ref(f16rng));
-  std::generate(w113, w113 + 72, std::ref(f16rng));
-  std::generate(w114, w114 + 1728, std::ref(f16rng));
-  std::generate(w115, w115 + 24, std::ref(f16rng));
-  std::generate(w116, w116 + 2112, std::ref(f16rng));
-  std::generate(w117, w117 + 88, std::ref(f16rng));
-  std::generate(w118, w118 + 792, std::ref(f16rng));
-  std::generate(w119, w119 + 88, std::ref(f16rng));
-  std::generate(w120, w120 + 2112, std::ref(f16rng));
-  std::generate(w121, w121 + 24, std::ref(f16rng));
-  std::generate(w122, w122 + 2304, std::ref(f16rng));
-  std::generate(w123, w123 + 96, std::ref(f16rng));
-  std::generate(w124, w124 + 2400, std::ref(f16rng));
-  std::generate(w125, w125 + 96, std::ref(f16rng));
-  std::generate(w126, w126 + 2304, std::ref(f16rng));
-  std::generate(w127, w127 + 24, std::ref(f16rng));
-  std::generate(w128, w128 + 2304, std::ref(f16rng));
-  std::generate(w129, w129 + 96, std::ref(f16rng));
-  std::generate(w130, w130 + 3840, std::ref(f16rng));
-  std::generate(w131, w131 + 40, std::ref(f16rng));
-  std::generate(w132, w132 + 9600, std::ref(f16rng));
-  std::generate(w133, w133 + 240, std::ref(f16rng));
-  std::generate(w134, w134 + 6000, std::ref(f16rng));
-  std::generate(w135, w135 + 240, std::ref(f16rng));
-  std::generate(w136, w136 + 15360, std::ref(f16rng));
-  std::generate(w137, w137 + 64, std::ref(f16rng));
-  std::generate(w138, w138 + 15360, std::ref(f16rng));
-  std::generate(w139, w139 + 240, std::ref(f16rng));
-  std::generate(w140, w140 + 9600, std::ref(f16rng));
-  std::generate(w141, w141 + 40, std::ref(f16rng));
-  std::generate(w142, w142 + 9600, std::ref(f16rng));
-  std::generate(w143, w143 + 240, std::ref(f16rng));
-  std::generate(w144, w144 + 6000, std::ref(f16rng));
-  std::generate(w145, w145 + 240, std::ref(f16rng));
-  std::generate(w146, w146 + 15360, std::ref(f16rng));
-  std::generate(w147, w147 + 64, std::ref(f16rng));
-  std::generate(w148, w148 + 15360, std::ref(f16rng));
-  std::generate(w149, w149 + 240, std::ref(f16rng));
-  std::generate(w150, w150 + 9600, std::ref(f16rng));
-  std::generate(w151, w151 + 40, std::ref(f16rng));
-  std::generate(w152, w152 + 4800, std::ref(f16rng));
-  std::generate(w153, w153 + 120, std::ref(f16rng));
-  std::generate(w154, w154 + 3000, std::ref(f16rng));
-  std::generate(w155, w155 + 120, std::ref(f16rng));
-  std::generate(w156, w156 + 3840, std::ref(f16rng));
-  std::generate(w157, w157 + 32, std::ref(f16rng));
-  std::generate(w158, w158 + 3840, std::ref(f16rng));
-  std::generate(w159, w159 + 120, std::ref(f16rng));
-  std::generate(w160, w160 + 5760, std::ref(f16rng));
-  std::generate(w161, w161 + 48, std::ref(f16rng));
-  std::generate(w162, w162 + 6912, std::ref(f16rng));
-  std::generate(w163, w163 + 144, std::ref(f16rng));
-  std::generate(w164, w164 + 3600, std::ref(f16rng));
-  std::generate(w165, w165 + 144, std::ref(f16rng));
-  std::generate(w166, w166 + 5760, std::ref(f16rng));
-  std::generate(w167, w167 + 40, std::ref(f16rng));
-  std::generate(w168, w168 + 5760, std::ref(f16rng));
-  std::generate(w169, w169 + 144, std::ref(f16rng));
-  std::generate(w170, w170 + 6912, std::ref(f16rng));
-  std::generate(w171, w171 + 48, std::ref(f16rng));
-  std::generate(w172, w172 + 13824, std::ref(f16rng));
-  std::generate(w173, w173 + 288, std::ref(f16rng));
-  std::generate(w174, w174 + 7200, std::ref(f16rng));
-  std::generate(w175, w175 + 288, std::ref(f16rng));
-  std::generate(w176, w176 + 20736, std::ref(f16rng));
-  std::generate(w177, w177 + 72, std::ref(f16rng));
-  std::generate(w178, w178 + 20736, std::ref(f16rng));
-  std::generate(w179, w179 + 288, std::ref(f16rng));
-  std::generate(w180, w180 + 27648, std::ref(f16rng));
-  std::generate(w181, w181 + 96, std::ref(f16rng));
-  std::generate(w182, w182 + 55296, std::ref(f16rng));
-  std::generate(w183, w183 + 576, std::ref(f16rng));
-  std::generate(w184, w184 + 14400, std::ref(f16rng));
-  std::generate(w185, w185 + 576, std::ref(f16rng));
-  std::generate(w186, w186 + 82944, std::ref(f16rng));
-  std::generate(w187, w187 + 144, std::ref(f16rng));
-  std::generate(w188, w188 + 82944, std::ref(f16rng));
-  std::generate(w189, w189 + 576, std::ref(f16rng));
-  std::generate(w190, w190 + 55296, std::ref(f16rng));
-  std::generate(w191, w191 + 96, std::ref(f16rng));
-  std::generate(w192, w192 + 55296, std::ref(f16rng));
-  std::generate(w193, w193 + 576, std::ref(f16rng));
-  std::generate(w194, w194 + 14400, std::ref(f16rng));
-  std::generate(w195, w195 + 576, std::ref(f16rng));
-  std::generate(w196, w196 + 82944, std::ref(f16rng));
-  std::generate(w197, w197 + 144, std::ref(f16rng));
-  std::generate(w198, w198 + 82944, std::ref(f16rng));
-  std::generate(w199, w199 + 576, std::ref(f16rng));
-  std::generate(w200, w200 + 55296, std::ref(f16rng));
-  std::generate(w201, w201 + 96, std::ref(f16rng));
-  std::generate(w202, w202 + 55296, std::ref(f16rng));
-  std::generate(w203, w203 + 576, std::ref(f16rng));
-  std::generate(w204, w204 + 589824, std::ref(f16rng));
-  std::generate(w205, w205 + 1024, std::ref(f16rng));
-  std::generate(w206, w206 + 1025024, std::ref(f16rng));
-  std::generate(w207, w207 + 1001, std::ref(f16rng));
+  std::generate(v0.begin(), v0.end(), std::ref(f16rng));
+  std::generate(v1.begin(), v1.end(), std::ref(f16rng));
+  std::generate(v2.begin(), v2.end(), std::ref(f16rng));
+  std::generate(v3.begin(), v3.end(), std::ref(f16rng));
+  std::generate(v4.begin(), v4.end(), std::ref(f16rng));
+  std::generate(v5.begin(), v5.end(), std::ref(f16rng));
+  std::generate(v6.begin(), v6.end(), std::ref(f16rng));
+  std::generate(v7.begin(), v7.end(), std::ref(f16rng));
+  std::generate(v8.begin(), v8.end(), std::ref(f16rng));
+  std::generate(v9.begin(), v9.end(), std::ref(f16rng));
+  std::generate(v10.begin(), v10.end(), std::ref(f16rng));
+  std::generate(v11.begin(), v11.end(), std::ref(f16rng));
+  std::generate(v12.begin(), v12.end(), std::ref(f16rng));
+  std::generate(v13.begin(), v13.end(), std::ref(f16rng));
+  std::generate(v14.begin(), v14.end(), std::ref(f16rng));
+  std::generate(v15.begin(), v15.end(), std::ref(f16rng));
+  std::generate(v16.begin(), v16.end(), std::ref(f16rng));
+  std::generate(v17.begin(), v17.end(), std::ref(f16rng));
+  std::generate(v18.begin(), v18.end(), std::ref(f16rng));
+  std::generate(v19.begin(), v19.end(), std::ref(f16rng));
+  std::generate(v20.begin(), v20.end(), std::ref(f16rng));
+  std::generate(v21.begin(), v21.end(), std::ref(f16rng));
+  std::generate(v22.begin(), v22.end(), std::ref(f16rng));
+  std::generate(v23.begin(), v23.end(), std::ref(f16rng));
+  std::generate(v24.begin(), v24.end(), std::ref(f16rng));
+  std::generate(v25.begin(), v25.end(), std::ref(f16rng));
+  std::generate(v26.begin(), v26.end(), std::ref(f16rng));
+  std::generate(v27.begin(), v27.end(), std::ref(f16rng));
+  std::generate(v28.begin(), v28.end(), std::ref(f16rng));
+  std::generate(v29.begin(), v29.end(), std::ref(f16rng));
+  std::generate(v30.begin(), v30.end(), std::ref(f16rng));
+  std::generate(v31.begin(), v31.end(), std::ref(f16rng));
+  std::generate(v32.begin(), v32.end(), std::ref(f16rng));
+  std::generate(v33.begin(), v33.end(), std::ref(f16rng));
+  std::generate(v34.begin(), v34.end(), std::ref(f16rng));
+  std::generate(v35.begin(), v35.end(), std::ref(f16rng));
+  std::generate(v36.begin(), v36.end(), std::ref(f16rng));
+  std::generate(v37.begin(), v37.end(), std::ref(f16rng));
+  std::generate(v38.begin(), v38.end(), std::ref(f16rng));
+  std::generate(v39.begin(), v39.end(), std::ref(f16rng));
+  std::generate(v40.begin(), v40.end(), std::ref(f16rng));
+  std::generate(v41.begin(), v41.end(), std::ref(f16rng));
+  std::generate(v42.begin(), v42.end(), std::ref(f16rng));
+  std::generate(v43.begin(), v43.end(), std::ref(f16rng));
+  std::generate(v44.begin(), v44.end(), std::ref(f16rng));
+  std::generate(v45.begin(), v45.end(), std::ref(f16rng));
+  std::generate(v46.begin(), v46.end(), std::ref(f16rng));
+  std::generate(v47.begin(), v47.end(), std::ref(f16rng));
+  std::generate(v48.begin(), v48.end(), std::ref(f16rng));
+  std::generate(v49.begin(), v49.end(), std::ref(f16rng));
+  std::generate(v50.begin(), v50.end(), std::ref(f16rng));
+  std::generate(v51.begin(), v51.end(), std::ref(f16rng));
+  std::generate(v52.begin(), v52.end(), std::ref(f16rng));
+  std::generate(v53.begin(), v53.end(), std::ref(f16rng));
+  std::generate(v54.begin(), v54.end(), std::ref(f16rng));
+  std::generate(v55.begin(), v55.end(), std::ref(f16rng));
+  std::generate(v56.begin(), v56.end(), std::ref(f16rng));
+  std::generate(v57.begin(), v57.end(), std::ref(f16rng));
+  std::generate(v58.begin(), v58.end(), std::ref(f16rng));
+  std::generate(v59.begin(), v59.end(), std::ref(f16rng));
+  std::generate(v60.begin(), v60.end(), std::ref(f16rng));
+  std::generate(v61.begin(), v61.end(), std::ref(f16rng));
+  std::generate(v62.begin(), v62.end(), std::ref(f16rng));
+  std::generate(v63.begin(), v63.end(), std::ref(f16rng));
+  std::generate(v64.begin(), v64.end(), std::ref(f16rng));
+  std::generate(v65.begin(), v65.end(), std::ref(f16rng));
+  std::generate(v66.begin(), v66.end(), std::ref(f16rng));
+  std::generate(v67.begin(), v67.end(), std::ref(f16rng));
+  std::generate(v68.begin(), v68.end(), std::ref(f16rng));
+  std::generate(v69.begin(), v69.end(), std::ref(f16rng));
+  std::generate(v70.begin(), v70.end(), std::ref(f16rng));
+  std::generate(v71.begin(), v71.end(), std::ref(f16rng));
+  std::generate(v72.begin(), v72.end(), std::ref(f16rng));
+  std::generate(v73.begin(), v73.end(), std::ref(f16rng));
+  std::generate(v74.begin(), v74.end(), std::ref(f16rng));
+  std::generate(v75.begin(), v75.end(), std::ref(f16rng));
+  std::generate(v76.begin(), v76.end(), std::ref(f16rng));
+  std::generate(v77.begin(), v77.end(), std::ref(f16rng));
+  std::generate(v78.begin(), v78.end(), std::ref(f16rng));
+  std::generate(v79.begin(), v79.end(), std::ref(f16rng));
+  std::generate(v80.begin(), v80.end(), std::ref(f16rng));
+  std::generate(v81.begin(), v81.end(), std::ref(f16rng));
+  std::generate(v82.begin(), v82.end(), std::ref(f16rng));
+  std::generate(v83.begin(), v83.end(), std::ref(f16rng));
+  std::generate(v84.begin(), v84.end(), std::ref(f16rng));
+  std::generate(v85.begin(), v85.end(), std::ref(f16rng));
+  std::generate(v86.begin(), v86.end(), std::ref(f16rng));
+  std::generate(v87.begin(), v87.end(), std::ref(f16rng));
+  std::generate(v88.begin(), v88.end(), std::ref(f16rng));
+  std::generate(v89.begin(), v89.end(), std::ref(f16rng));
+  std::generate(v90.begin(), v90.end(), std::ref(f16rng));
+  std::generate(v91.begin(), v91.end(), std::ref(f16rng));
+  std::generate(v92.begin(), v92.end(), std::ref(f16rng));
+  std::generate(v93.begin(), v93.end(), std::ref(f16rng));
+  std::generate(v94.begin(), v94.end(), std::ref(f16rng));
+  std::generate(v95.begin(), v95.end(), std::ref(f16rng));
+  std::generate(v96.begin(), v96.end(), std::ref(f16rng));
+  std::generate(v97.begin(), v97.end(), std::ref(f16rng));
+  std::generate(v98.begin(), v98.end(), std::ref(f16rng));
+  std::generate(v99.begin(), v99.end(), std::ref(f16rng));
+  std::generate(w100.begin(), w100.end(), std::ref(f16rng));
+  std::generate(w101.begin(), w101.end(), std::ref(f16rng));
+  std::generate(w102.begin(), w102.end(), std::ref(f16rng));
+  std::generate(w103.begin(), w103.end(), std::ref(f16rng));
+  std::generate(w104.begin(), w104.end(), std::ref(f16rng));
+  std::generate(w105.begin(), w105.end(), std::ref(f16rng));
+  std::generate(w106.begin(), w106.end(), std::ref(f16rng));
+  std::generate(w107.begin(), w107.end(), std::ref(f16rng));
+  std::generate(w108.begin(), w108.end(), std::ref(f16rng));
+  std::generate(w109.begin(), w109.end(), std::ref(f16rng));
+  std::generate(w110.begin(), w110.end(), std::ref(f16rng));
+  std::generate(w111.begin(), w111.end(), std::ref(f16rng));
+  std::generate(w112.begin(), w112.end(), std::ref(f16rng));
+  std::generate(w113.begin(), w113.end(), std::ref(f16rng));
+  std::generate(w114.begin(), w114.end(), std::ref(f16rng));
+  std::generate(w115.begin(), w115.end(), std::ref(f16rng));
+  std::generate(w116.begin(), w116.end(), std::ref(f16rng));
+  std::generate(w117.begin(), w117.end(), std::ref(f16rng));
+  std::generate(w118.begin(), w118.end(), std::ref(f16rng));
+  std::generate(w119.begin(), w119.end(), std::ref(f16rng));
+  std::generate(w120.begin(), w120.end(), std::ref(f16rng));
+  std::generate(w121.begin(), w121.end(), std::ref(f16rng));
+  std::generate(w122.begin(), w122.end(), std::ref(f16rng));
+  std::generate(w123.begin(), w123.end(), std::ref(f16rng));
+  std::generate(w124.begin(), w124.end(), std::ref(f16rng));
+  std::generate(w125.begin(), w125.end(), std::ref(f16rng));
+  std::generate(w126.begin(), w126.end(), std::ref(f16rng));
+  std::generate(w127.begin(), w127.end(), std::ref(f16rng));
+  std::generate(w128.begin(), w128.end(), std::ref(f16rng));
+  std::generate(w129.begin(), w129.end(), std::ref(f16rng));
+  std::generate(w130.begin(), w130.end(), std::ref(f16rng));
+  std::generate(w131.begin(), w131.end(), std::ref(f16rng));
+  std::generate(w132.begin(), w132.end(), std::ref(f16rng));
+  std::generate(w133.begin(), w133.end(), std::ref(f16rng));
+  std::generate(w134.begin(), w134.end(), std::ref(f16rng));
+  std::generate(w135.begin(), w135.end(), std::ref(f16rng));
+  std::generate(w136.begin(), w136.end(), std::ref(f16rng));
+  std::generate(w137.begin(), w137.end(), std::ref(f16rng));
+  std::generate(w138.begin(), w138.end(), std::ref(f16rng));
+  std::generate(w139.begin(), w139.end(), std::ref(f16rng));
+  std::generate(w140.begin(), w140.end(), std::ref(f16rng));
+  std::generate(w141.begin(), w141.end(), std::ref(f16rng));
+  std::generate(w142.begin(), w142.end(), std::ref(f16rng));
+  std::generate(w143.begin(), w143.end(), std::ref(f16rng));
+  std::generate(w144.begin(), w144.end(), std::ref(f16rng));
+  std::generate(w145.begin(), w145.end(), std::ref(f16rng));
+  std::generate(w146.begin(), w146.end(), std::ref(f16rng));
+  std::generate(w147.begin(), w147.end(), std::ref(f16rng));
+  std::generate(w148.begin(), w148.end(), std::ref(f16rng));
+  std::generate(w149.begin(), w149.end(), std::ref(f16rng));
+  std::generate(w150.begin(), w150.end(), std::ref(f16rng));
+  std::generate(w151.begin(), w151.end(), std::ref(f16rng));
+  std::generate(w152.begin(), w152.end(), std::ref(f16rng));
+  std::generate(w153.begin(), w153.end(), std::ref(f16rng));
+  std::generate(w154.begin(), w154.end(), std::ref(f16rng));
+  std::generate(w155.begin(), w155.end(), std::ref(f16rng));
+  std::generate(w156.begin(), w156.end(), std::ref(f16rng));
+  std::generate(w157.begin(), w157.end(), std::ref(f16rng));
+  std::generate(w158.begin(), w158.end(), std::ref(f16rng));
+  std::generate(w159.begin(), w159.end(), std::ref(f16rng));
+  std::generate(w160.begin(), w160.end(), std::ref(f16rng));
+  std::generate(w161.begin(), w161.end(), std::ref(f16rng));
+  std::generate(w162.begin(), w162.end(), std::ref(f16rng));
+  std::generate(w163.begin(), w163.end(), std::ref(f16rng));
+  std::generate(w164.begin(), w164.end(), std::ref(f16rng));
+  std::generate(w165.begin(), w165.end(), std::ref(f16rng));
+  std::generate(w166.begin(), w166.end(), std::ref(f16rng));
+  std::generate(w167.begin(), w167.end(), std::ref(f16rng));
+  std::generate(w168.begin(), w168.end(), std::ref(f16rng));
+  std::generate(w169.begin(), w169.end(), std::ref(f16rng));
+  std::generate(w170.begin(), w170.end(), std::ref(f16rng));
+  std::generate(w171.begin(), w171.end(), std::ref(f16rng));
+  std::generate(w172.begin(), w172.end(), std::ref(f16rng));
+  std::generate(w173.begin(), w173.end(), std::ref(f16rng));
+  std::generate(w174.begin(), w174.end(), std::ref(f16rng));
+  std::generate(w175.begin(), w175.end(), std::ref(f16rng));
+  std::generate(w176.begin(), w176.end(), std::ref(f16rng));
+  std::generate(w177.begin(), w177.end(), std::ref(f16rng));
+  std::generate(w178.begin(), w178.end(), std::ref(f16rng));
+  std::generate(w179.begin(), w179.end(), std::ref(f16rng));
+  std::generate(w180.begin(), w180.end(), std::ref(f16rng));
+  std::generate(w181.begin(), w181.end(), std::ref(f16rng));
+  std::generate(w182.begin(), w182.end(), std::ref(f16rng));
+  std::generate(w183.begin(), w183.end(), std::ref(f16rng));
+  std::generate(w184.begin(), w184.end(), std::ref(f16rng));
+  std::generate(w185.begin(), w185.end(), std::ref(f16rng));
+  std::generate(w186.begin(), w186.end(), std::ref(f16rng));
+  std::generate(w187.begin(), w187.end(), std::ref(f16rng));
+  std::generate(w188.begin(), w188.end(), std::ref(f16rng));
+  std::generate(w189.begin(), w189.end(), std::ref(f16rng));
+  std::generate(w190.begin(), w190.end(), std::ref(f16rng));
+  std::generate(w191.begin(), w191.end(), std::ref(f16rng));
+  std::generate(w192.begin(), w192.end(), std::ref(f16rng));
+  std::generate(w193.begin(), w193.end(), std::ref(f16rng));
+  std::generate(w194.begin(), w194.end(), std::ref(f16rng));
+  std::generate(w195.begin(), w195.end(), std::ref(f16rng));
+  std::generate(w196.begin(), w196.end(), std::ref(f16rng));
+  std::generate(w197.begin(), w197.end(), std::ref(f16rng));
+  std::generate(w198.begin(), w198.end(), std::ref(f16rng));
+  std::generate(w199.begin(), w199.end(), std::ref(f16rng));
+  std::generate(w200.begin(), w200.end(), std::ref(f16rng));
+  std::generate(w201.begin(), w201.end(), std::ref(f16rng));
+  std::generate(w202.begin(), w202.end(), std::ref(f16rng));
+  std::generate(w203.begin(), w203.end(), std::ref(f16rng));
+  std::generate(w204.begin(), w204.end(), std::ref(f16rng));
+  std::generate(w205.begin(), w205.end(), std::ref(f16rng));
+  std::generate(w206.begin(), w206.end(), std::ref(f16rng));
+  std::generate(w207.begin(), w207.end(), std::ref(f16rng));
 
   ExecutionPlan operators;
   xnn_status status;
@@ -455,7 +456,7 @@
     16 /* output_channels_per_group */,
     3 /* input pixel stride */,
     16 /* output pixel stride */,
-    w100, w101,
+    w100.data(), w101.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op0);
@@ -490,7 +491,7 @@
     1 /* output_channels_per_group */,
     16 /* input pixel stride */,
     16 /* output pixel stride */,
-    w102, w103,
+    w102.data(), w103.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op2);
@@ -524,7 +525,7 @@
     8 /* output_channels_per_group */,
     16 /* input pixel stride */,
     8 /* output pixel stride */,
-    w104, w105,
+    w104.data(), w105.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op4);
@@ -546,7 +547,7 @@
     16 /* output_channels_per_group */,
     8 /* input pixel stride */,
     16 /* output pixel stride */,
-    w106, w107,
+    w106.data(), w107.data(),
     0.0f /* output min */, +0x1.00014Fp+0 /* output max */,
     0 /* flags */,
     &op5);
@@ -579,7 +580,7 @@
     16 /* output_channels_per_group */,
     16 /* input pixel stride */,
     16 /* output pixel stride */,
-    w108, w109,
+    w108.data(), w109.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op7);
@@ -601,7 +602,7 @@
     72 /* output_channels_per_group */,
     16 /* input pixel stride */,
     72 /* output pixel stride */,
-    w110, w111,
+    w110.data(), w111.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op8);
@@ -623,7 +624,7 @@
     1 /* output_channels_per_group */,
     72 /* input pixel stride */,
     72 /* output pixel stride */,
-    w112, w113,
+    w112.data(), w113.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op9);
@@ -645,7 +646,7 @@
     24 /* output_channels_per_group */,
     72 /* input pixel stride */,
     24 /* output pixel stride */,
-    w114, w115,
+    w114.data(), w115.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op10);
@@ -667,7 +668,7 @@
     88 /* output_channels_per_group */,
     24 /* input pixel stride */,
     88 /* output pixel stride */,
-    w116, w117,
+    w116.data(), w117.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op11);
@@ -689,7 +690,7 @@
     1 /* output_channels_per_group */,
     88 /* input pixel stride */,
     88 /* output pixel stride */,
-    w118, w119,
+    w118.data(), w119.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op12);
@@ -711,7 +712,7 @@
     24 /* output_channels_per_group */,
     88 /* input pixel stride */,
     24 /* output pixel stride */,
-    w120, w121,
+    w120.data(), w121.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op13);
@@ -744,7 +745,7 @@
     96 /* output_channels_per_group */,
     24 /* input pixel stride */,
     96 /* output pixel stride */,
-    w122, w123,
+    w122.data(), w123.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op15);
@@ -779,7 +780,7 @@
     1 /* output_channels_per_group */,
     96 /* input pixel stride */,
     96 /* output pixel stride */,
-    w124, w125,
+    w124.data(), w125.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op17);
@@ -826,7 +827,7 @@
     24 /* output_channels_per_group */,
     96 /* input pixel stride */,
     24 /* output pixel stride */,
-    w126, w127,
+    w126.data(), w127.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op20);
@@ -848,7 +849,7 @@
     96 /* output_channels_per_group */,
     24 /* input pixel stride */,
     96 /* output pixel stride */,
-    w128, w129,
+    w128.data(), w129.data(),
     0.0f /* output min */, +0x1.00014Fp+0 /* output max */,
     0 /* flags */,
     &op21);
@@ -881,7 +882,7 @@
     40 /* output_channels_per_group */,
     96 /* input pixel stride */,
     40 /* output pixel stride */,
-    w130, w131,
+    w130.data(), w131.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op23);
@@ -903,7 +904,7 @@
     240 /* output_channels_per_group */,
     40 /* input pixel stride */,
     240 /* output pixel stride */,
-    w132, w133,
+    w132.data(), w133.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op24);
@@ -938,7 +939,7 @@
     1 /* output_channels_per_group */,
     240 /* input pixel stride */,
     240 /* output pixel stride */,
-    w134, w135,
+    w134.data(), w135.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op26);
@@ -985,7 +986,7 @@
     64 /* output_channels_per_group */,
     240 /* input pixel stride */,
     64 /* output pixel stride */,
-    w136, w137,
+    w136.data(), w137.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op29);
@@ -1007,7 +1008,7 @@
     240 /* output_channels_per_group */,
     64 /* input pixel stride */,
     240 /* output pixel stride */,
-    w138, w139,
+    w138.data(), w139.data(),
     0.0f /* output min */, +0x1.00014Fp+0 /* output max */,
     0 /* flags */,
     &op30);
@@ -1040,7 +1041,7 @@
     40 /* output_channels_per_group */,
     240 /* input pixel stride */,
     40 /* output pixel stride */,
-    w140, w141,
+    w140.data(), w141.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op32);
@@ -1073,7 +1074,7 @@
     240 /* output_channels_per_group */,
     40 /* input pixel stride */,
     240 /* output pixel stride */,
-    w142, w143,
+    w142.data(), w143.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op34);
@@ -1108,7 +1109,7 @@
     1 /* output_channels_per_group */,
     240 /* input pixel stride */,
     240 /* output pixel stride */,
-    w144, w145,
+    w144.data(), w145.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op36);
@@ -1155,7 +1156,7 @@
     64 /* output_channels_per_group */,
     240 /* input pixel stride */,
     64 /* output pixel stride */,
-    w146, w147,
+    w146.data(), w147.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op39);
@@ -1177,7 +1178,7 @@
     240 /* output_channels_per_group */,
     64 /* input pixel stride */,
     240 /* output pixel stride */,
-    w148, w149,
+    w148.data(), w149.data(),
     0.0f /* output min */, +0x1.00014Fp+0 /* output max */,
     0 /* flags */,
     &op40);
@@ -1210,7 +1211,7 @@
     40 /* output_channels_per_group */,
     240 /* input pixel stride */,
     40 /* output pixel stride */,
-    w150, w151,
+    w150.data(), w151.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op42);
@@ -1243,7 +1244,7 @@
     120 /* output_channels_per_group */,
     40 /* input pixel stride */,
     120 /* output pixel stride */,
-    w152, w153,
+    w152.data(), w153.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op44);
@@ -1278,7 +1279,7 @@
     1 /* output_channels_per_group */,
     120 /* input pixel stride */,
     120 /* output pixel stride */,
-    w154, w155,
+    w154.data(), w155.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op46);
@@ -1325,7 +1326,7 @@
     32 /* output_channels_per_group */,
     120 /* input pixel stride */,
     32 /* output pixel stride */,
-    w156, w157,
+    w156.data(), w157.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op49);
@@ -1347,7 +1348,7 @@
     120 /* output_channels_per_group */,
     32 /* input pixel stride */,
     120 /* output pixel stride */,
-    w158, w159,
+    w158.data(), w159.data(),
     0.0f /* output min */, +0x1.00014Fp+0 /* output max */,
     0 /* flags */,
     &op50);
@@ -1380,7 +1381,7 @@
     48 /* output_channels_per_group */,
     120 /* input pixel stride */,
     48 /* output pixel stride */,
-    w160, w161,
+    w160.data(), w161.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op52);
@@ -1402,7 +1403,7 @@
     144 /* output_channels_per_group */,
     48 /* input pixel stride */,
     144 /* output pixel stride */,
-    w162, w163,
+    w162.data(), w163.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op53);
@@ -1437,7 +1438,7 @@
     1 /* output_channels_per_group */,
     144 /* input pixel stride */,
     144 /* output pixel stride */,
-    w164, w165,
+    w164.data(), w165.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op55);
@@ -1484,7 +1485,7 @@
     40 /* output_channels_per_group */,
     144 /* input pixel stride */,
     40 /* output pixel stride */,
-    w166, w167,
+    w166.data(), w167.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op58);
@@ -1506,7 +1507,7 @@
     144 /* output_channels_per_group */,
     40 /* input pixel stride */,
     144 /* output pixel stride */,
-    w168, w169,
+    w168.data(), w169.data(),
     0.0f /* output min */, +0x1.00014Fp+0 /* output max */,
     0 /* flags */,
     &op59);
@@ -1539,7 +1540,7 @@
     48 /* output_channels_per_group */,
     144 /* input pixel stride */,
     48 /* output pixel stride */,
-    w170, w171,
+    w170.data(), w171.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op61);
@@ -1572,7 +1573,7 @@
     288 /* output_channels_per_group */,
     48 /* input pixel stride */,
     288 /* output pixel stride */,
-    w172, w173,
+    w172.data(), w173.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op63);
@@ -1607,7 +1608,7 @@
     1 /* output_channels_per_group */,
     288 /* input pixel stride */,
     288 /* output pixel stride */,
-    w174, w175,
+    w174.data(), w175.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op65);
@@ -1654,7 +1655,7 @@
     72 /* output_channels_per_group */,
     288 /* input pixel stride */,
     72 /* output pixel stride */,
-    w176, w177,
+    w176.data(), w177.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op68);
@@ -1676,7 +1677,7 @@
     288 /* output_channels_per_group */,
     72 /* input pixel stride */,
     288 /* output pixel stride */,
-    w178, w179,
+    w178.data(), w179.data(),
     0.0f /* output min */, +0x1.00014Fp+0 /* output max */,
     0 /* flags */,
     &op69);
@@ -1709,7 +1710,7 @@
     96 /* output_channels_per_group */,
     288 /* input pixel stride */,
     96 /* output pixel stride */,
-    w180, w181,
+    w180.data(), w181.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op71);
@@ -1731,7 +1732,7 @@
     576 /* output_channels_per_group */,
     96 /* input pixel stride */,
     576 /* output pixel stride */,
-    w182, w183,
+    w182.data(), w183.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op72);
@@ -1766,7 +1767,7 @@
     1 /* output_channels_per_group */,
     576 /* input pixel stride */,
     576 /* output pixel stride */,
-    w184, w185,
+    w184.data(), w185.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op74);
@@ -1813,7 +1814,7 @@
     144 /* output_channels_per_group */,
     576 /* input pixel stride */,
     144 /* output pixel stride */,
-    w186, w187,
+    w186.data(), w187.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op77);
@@ -1835,7 +1836,7 @@
     576 /* output_channels_per_group */,
     144 /* input pixel stride */,
     576 /* output pixel stride */,
-    w188, w189,
+    w188.data(), w189.data(),
     0.0f /* output min */, +0x1.00014Fp+0 /* output max */,
     0 /* flags */,
     &op78);
@@ -1868,7 +1869,7 @@
     96 /* output_channels_per_group */,
     576 /* input pixel stride */,
     96 /* output pixel stride */,
-    w190, w191,
+    w190.data(), w191.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op80);
@@ -1901,7 +1902,7 @@
     576 /* output_channels_per_group */,
     96 /* input pixel stride */,
     576 /* output pixel stride */,
-    w192, w193,
+    w192.data(), w193.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op82);
@@ -1936,7 +1937,7 @@
     1 /* output_channels_per_group */,
     576 /* input pixel stride */,
     576 /* output pixel stride */,
-    w194, w195,
+    w194.data(), w195.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op84);
@@ -1983,7 +1984,7 @@
     144 /* output_channels_per_group */,
     576 /* input pixel stride */,
     144 /* output pixel stride */,
-    w196, w197,
+    w196.data(), w197.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op87);
@@ -2005,7 +2006,7 @@
     576 /* output_channels_per_group */,
     144 /* input pixel stride */,
     576 /* output pixel stride */,
-    w198, w199,
+    w198.data(), w199.data(),
     0.0f /* output min */, +0x1.00014Fp+0 /* output max */,
     0 /* flags */,
     &op88);
@@ -2038,7 +2039,7 @@
     96 /* output_channels_per_group */,
     576 /* input pixel stride */,
     96 /* output pixel stride */,
-    w200, w201,
+    w200.data(), w201.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op90);
@@ -2071,7 +2072,7 @@
     576 /* output_channels_per_group */,
     96 /* input pixel stride */,
     576 /* output pixel stride */,
-    w202, w203,
+    w202.data(), w203.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op92);
@@ -2118,7 +2119,7 @@
     1024 /* output_channels_per_group */,
     576 /* input pixel stride */,
     1024 /* output pixel stride */,
-    w204, w205,
+    w204.data(), w205.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op95);
@@ -2165,7 +2166,7 @@
     1001 /* output_channels_per_group */,
     1024 /* input pixel stride */,
     1001 /* output pixel stride */,
-    w206, w207,
+    w206.data(), w207.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op98);
@@ -2180,7 +2181,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op0,
     1 /* batch size */, 224 /* input height */, 224 /* input width */,
-    v0 /* input */, v1 /* output */,
+    v0.data() /* input */, v1.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #0" << std::endl;
@@ -2190,7 +2191,7 @@
   status = xnn_setup_hardswish_nc_f16(
     op1,
     12544 /* batch size */,
-    v1 /* input */, v2 /* output */,
+    v1.data() /* input */, v2.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #1" << std::endl;
@@ -2200,7 +2201,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op2,
     1 /* batch size */, 112 /* input height */, 112 /* input width */,
-    v2 /* input */, v3 /* output */,
+    v2.data() /* input */, v3.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #2" << std::endl;
@@ -2210,7 +2211,7 @@
   status = xnn_setup_global_average_pooling_nwc_f16(
     op3,
     1 /* batch size */, 3136 /* width */,
-    v3 /* input */, v4 /* output */,
+    v3.data() /* input */, v4.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #3" << std::endl;
@@ -2220,7 +2221,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op4,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v4 /* input */, v5 /* output */,
+    v4.data() /* input */, v5.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #4" << std::endl;
@@ -2230,7 +2231,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op5,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v5 /* input */, v6 /* output */,
+    v5.data() /* input */, v6.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #5" << std::endl;
@@ -2243,7 +2244,7 @@
     status = xnn_setup_multiply_nd_f16(
       op6,
       4, a_shape, 4, b_shape,
-      v3 /* a */, v6 /* b */, v7 /* output */,
+      v3.data() /* a */, v6.data() /* b */, v7.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -2254,7 +2255,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op7,
     1 /* batch size */, 56 /* input height */, 56 /* input width */,
-    v7 /* input */, v8 /* output */,
+    v7.data() /* input */, v8.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #7" << std::endl;
@@ -2264,7 +2265,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op8,
     1 /* batch size */, 56 /* input height */, 56 /* input width */,
-    v8 /* input */, v9 /* output */,
+    v8.data() /* input */, v9.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #8" << std::endl;
@@ -2274,7 +2275,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op9,
     1 /* batch size */, 56 /* input height */, 56 /* input width */,
-    v9 /* input */, v10 /* output */,
+    v9.data() /* input */, v10.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #9" << std::endl;
@@ -2284,7 +2285,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op10,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v10 /* input */, v11 /* output */,
+    v10.data() /* input */, v11.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #10" << std::endl;
@@ -2294,7 +2295,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op11,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v11 /* input */, v12 /* output */,
+    v11.data() /* input */, v12.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #11" << std::endl;
@@ -2304,7 +2305,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op12,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v12 /* input */, v13 /* output */,
+    v12.data() /* input */, v13.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #12" << std::endl;
@@ -2314,7 +2315,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op13,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v13 /* input */, v14 /* output */,
+    v13.data() /* input */, v14.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #13" << std::endl;
@@ -2327,7 +2328,7 @@
     status = xnn_setup_add_nd_f16(
       op14,
       4, a_shape, 4, b_shape,
-      v14 /* a */, v11 /* b */, v15 /* output */,
+      v14.data() /* a */, v11.data() /* b */, v15.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -2338,7 +2339,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op15,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v15 /* input */, v16 /* output */,
+    v15.data() /* input */, v16.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #15" << std::endl;
@@ -2348,7 +2349,7 @@
   status = xnn_setup_hardswish_nc_f16(
     op16,
     784 /* batch size */,
-    v16 /* input */, v17 /* output */,
+    v16.data() /* input */, v17.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #16" << std::endl;
@@ -2358,7 +2359,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op17,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v17 /* input */, v18 /* output */,
+    v17.data() /* input */, v18.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #17" << std::endl;
@@ -2368,7 +2369,7 @@
   status = xnn_setup_hardswish_nc_f16(
     op18,
     196 /* batch size */,
-    v18 /* input */, v19 /* output */,
+    v18.data() /* input */, v19.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #18" << std::endl;
@@ -2378,7 +2379,7 @@
   status = xnn_setup_global_average_pooling_nwc_f16(
     op19,
     1 /* batch size */, 196 /* width */,
-    v19 /* input */, v20 /* output */,
+    v19.data() /* input */, v20.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #19" << std::endl;
@@ -2388,7 +2389,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op20,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v20 /* input */, v21 /* output */,
+    v20.data() /* input */, v21.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #20" << std::endl;
@@ -2398,7 +2399,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op21,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v21 /* input */, v22 /* output */,
+    v21.data() /* input */, v22.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #21" << std::endl;
@@ -2411,7 +2412,7 @@
     status = xnn_setup_multiply_nd_f16(
       op22,
       4, a_shape, 4, b_shape,
-      v19 /* a */, v22 /* b */, v23 /* output */,
+      v19.data() /* a */, v22.data() /* b */, v23.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -2422,7 +2423,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op23,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v23 /* input */, v24 /* output */,
+    v23.data() /* input */, v24.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #23" << std::endl;
@@ -2432,7 +2433,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op24,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v24 /* input */, v25 /* output */,
+    v24.data() /* input */, v25.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #24" << std::endl;
@@ -2442,7 +2443,7 @@
   status = xnn_setup_hardswish_nc_f16(
     op25,
     196 /* batch size */,
-    v25 /* input */, v26 /* output */,
+    v25.data() /* input */, v26.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #25" << std::endl;
@@ -2452,7 +2453,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op26,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v26 /* input */, v27 /* output */,
+    v26.data() /* input */, v27.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #26" << std::endl;
@@ -2462,7 +2463,7 @@
   status = xnn_setup_hardswish_nc_f16(
     op27,
     196 /* batch size */,
-    v27 /* input */, v28 /* output */,
+    v27.data() /* input */, v28.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #27" << std::endl;
@@ -2472,7 +2473,7 @@
   status = xnn_setup_global_average_pooling_nwc_f16(
     op28,
     1 /* batch size */, 196 /* width */,
-    v28 /* input */, v29 /* output */,
+    v28.data() /* input */, v29.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #28" << std::endl;
@@ -2482,7 +2483,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op29,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v29 /* input */, v30 /* output */,
+    v29.data() /* input */, v30.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #29" << std::endl;
@@ -2492,7 +2493,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op30,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v30 /* input */, v31 /* output */,
+    v30.data() /* input */, v31.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #30" << std::endl;
@@ -2505,7 +2506,7 @@
     status = xnn_setup_multiply_nd_f16(
       op31,
       4, a_shape, 4, b_shape,
-      v28 /* a */, v31 /* b */, v32 /* output */,
+      v28.data() /* a */, v31.data() /* b */, v32.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -2516,7 +2517,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op32,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v32 /* input */, v33 /* output */,
+    v32.data() /* input */, v33.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #32" << std::endl;
@@ -2529,7 +2530,7 @@
     status = xnn_setup_add_nd_f16(
       op33,
       4, a_shape, 4, b_shape,
-      v33 /* a */, v24 /* b */, v34 /* output */,
+      v33.data() /* a */, v24.data() /* b */, v34.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -2540,7 +2541,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op34,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v34 /* input */, v35 /* output */,
+    v34.data() /* input */, v35.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #34" << std::endl;
@@ -2550,7 +2551,7 @@
   status = xnn_setup_hardswish_nc_f16(
     op35,
     196 /* batch size */,
-    v35 /* input */, v36 /* output */,
+    v35.data() /* input */, v36.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #35" << std::endl;
@@ -2560,7 +2561,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op36,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v36 /* input */, v37 /* output */,
+    v36.data() /* input */, v37.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #36" << std::endl;
@@ -2570,7 +2571,7 @@
   status = xnn_setup_hardswish_nc_f16(
     op37,
     196 /* batch size */,
-    v37 /* input */, v38 /* output */,
+    v37.data() /* input */, v38.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #37" << std::endl;
@@ -2580,7 +2581,7 @@
   status = xnn_setup_global_average_pooling_nwc_f16(
     op38,
     1 /* batch size */, 196 /* width */,
-    v38 /* input */, v39 /* output */,
+    v38.data() /* input */, v39.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #38" << std::endl;
@@ -2590,7 +2591,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op39,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v39 /* input */, v40 /* output */,
+    v39.data() /* input */, v40.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #39" << std::endl;
@@ -2600,7 +2601,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op40,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v40 /* input */, v41 /* output */,
+    v40.data() /* input */, v41.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #40" << std::endl;
@@ -2613,7 +2614,7 @@
     status = xnn_setup_multiply_nd_f16(
       op41,
       4, a_shape, 4, b_shape,
-      v38 /* a */, v41 /* b */, v42 /* output */,
+      v38.data() /* a */, v41.data() /* b */, v42.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -2624,7 +2625,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op42,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v42 /* input */, v43 /* output */,
+    v42.data() /* input */, v43.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #42" << std::endl;
@@ -2637,7 +2638,7 @@
     status = xnn_setup_add_nd_f16(
       op43,
       4, a_shape, 4, b_shape,
-      v43 /* a */, v34 /* b */, v44 /* output */,
+      v43.data() /* a */, v34.data() /* b */, v44.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -2648,7 +2649,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op44,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v44 /* input */, v45 /* output */,
+    v44.data() /* input */, v45.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #44" << std::endl;
@@ -2658,7 +2659,7 @@
   status = xnn_setup_hardswish_nc_f16(
     op45,
     196 /* batch size */,
-    v45 /* input */, v46 /* output */,
+    v45.data() /* input */, v46.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #45" << std::endl;
@@ -2668,7 +2669,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op46,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v46 /* input */, v47 /* output */,
+    v46.data() /* input */, v47.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #46" << std::endl;
@@ -2678,7 +2679,7 @@
   status = xnn_setup_hardswish_nc_f16(
     op47,
     196 /* batch size */,
-    v47 /* input */, v48 /* output */,
+    v47.data() /* input */, v48.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #47" << std::endl;
@@ -2688,7 +2689,7 @@
   status = xnn_setup_global_average_pooling_nwc_f16(
     op48,
     1 /* batch size */, 196 /* width */,
-    v48 /* input */, v49 /* output */,
+    v48.data() /* input */, v49.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #48" << std::endl;
@@ -2698,7 +2699,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op49,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v49 /* input */, v50 /* output */,
+    v49.data() /* input */, v50.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #49" << std::endl;
@@ -2708,7 +2709,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op50,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v50 /* input */, v51 /* output */,
+    v50.data() /* input */, v51.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #50" << std::endl;
@@ -2721,7 +2722,7 @@
     status = xnn_setup_multiply_nd_f16(
       op51,
       4, a_shape, 4, b_shape,
-      v48 /* a */, v51 /* b */, v52 /* output */,
+      v48.data() /* a */, v51.data() /* b */, v52.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -2732,7 +2733,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op52,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v52 /* input */, v53 /* output */,
+    v52.data() /* input */, v53.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #52" << std::endl;
@@ -2742,7 +2743,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op53,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v53 /* input */, v54 /* output */,
+    v53.data() /* input */, v54.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #53" << std::endl;
@@ -2752,7 +2753,7 @@
   status = xnn_setup_hardswish_nc_f16(
     op54,
     196 /* batch size */,
-    v54 /* input */, v55 /* output */,
+    v54.data() /* input */, v55.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #54" << std::endl;
@@ -2762,7 +2763,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op55,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v55 /* input */, v56 /* output */,
+    v55.data() /* input */, v56.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #55" << std::endl;
@@ -2772,7 +2773,7 @@
   status = xnn_setup_hardswish_nc_f16(
     op56,
     196 /* batch size */,
-    v56 /* input */, v57 /* output */,
+    v56.data() /* input */, v57.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #56" << std::endl;
@@ -2782,7 +2783,7 @@
   status = xnn_setup_global_average_pooling_nwc_f16(
     op57,
     1 /* batch size */, 196 /* width */,
-    v57 /* input */, v58 /* output */,
+    v57.data() /* input */, v58.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #57" << std::endl;
@@ -2792,7 +2793,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op58,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v58 /* input */, v59 /* output */,
+    v58.data() /* input */, v59.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #58" << std::endl;
@@ -2802,7 +2803,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op59,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v59 /* input */, v60 /* output */,
+    v59.data() /* input */, v60.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #59" << std::endl;
@@ -2815,7 +2816,7 @@
     status = xnn_setup_multiply_nd_f16(
       op60,
       4, a_shape, 4, b_shape,
-      v57 /* a */, v60 /* b */, v61 /* output */,
+      v57.data() /* a */, v60.data() /* b */, v61.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -2826,7 +2827,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op61,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v61 /* input */, v62 /* output */,
+    v61.data() /* input */, v62.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #61" << std::endl;
@@ -2839,7 +2840,7 @@
     status = xnn_setup_add_nd_f16(
       op62,
       4, a_shape, 4, b_shape,
-      v62 /* a */, v53 /* b */, v63 /* output */,
+      v62.data() /* a */, v53.data() /* b */, v63.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -2850,7 +2851,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op63,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v63 /* input */, v64 /* output */,
+    v63.data() /* input */, v64.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #63" << std::endl;
@@ -2860,7 +2861,7 @@
   status = xnn_setup_hardswish_nc_f16(
     op64,
     196 /* batch size */,
-    v64 /* input */, v65 /* output */,
+    v64.data() /* input */, v65.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #64" << std::endl;
@@ -2870,7 +2871,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op65,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v65 /* input */, v66 /* output */,
+    v65.data() /* input */, v66.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #65" << std::endl;
@@ -2880,7 +2881,7 @@
   status = xnn_setup_hardswish_nc_f16(
     op66,
     49 /* batch size */,
-    v66 /* input */, v67 /* output */,
+    v66.data() /* input */, v67.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #66" << std::endl;
@@ -2890,7 +2891,7 @@
   status = xnn_setup_global_average_pooling_nwc_f16(
     op67,
     1 /* batch size */, 49 /* width */,
-    v67 /* input */, v68 /* output */,
+    v67.data() /* input */, v68.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #67" << std::endl;
@@ -2900,7 +2901,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op68,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v68 /* input */, v69 /* output */,
+    v68.data() /* input */, v69.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #68" << std::endl;
@@ -2910,7 +2911,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op69,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v69 /* input */, v70 /* output */,
+    v69.data() /* input */, v70.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #69" << std::endl;
@@ -2923,7 +2924,7 @@
     status = xnn_setup_multiply_nd_f16(
       op70,
       4, a_shape, 4, b_shape,
-      v67 /* a */, v70 /* b */, v71 /* output */,
+      v67.data() /* a */, v70.data() /* b */, v71.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -2934,7 +2935,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op71,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v71 /* input */, v72 /* output */,
+    v71.data() /* input */, v72.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #71" << std::endl;
@@ -2944,7 +2945,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op72,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v72 /* input */, v73 /* output */,
+    v72.data() /* input */, v73.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #72" << std::endl;
@@ -2954,7 +2955,7 @@
   status = xnn_setup_hardswish_nc_f16(
     op73,
     49 /* batch size */,
-    v73 /* input */, v74 /* output */,
+    v73.data() /* input */, v74.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #73" << std::endl;
@@ -2964,7 +2965,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op74,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v74 /* input */, v75 /* output */,
+    v74.data() /* input */, v75.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #74" << std::endl;
@@ -2974,7 +2975,7 @@
   status = xnn_setup_hardswish_nc_f16(
     op75,
     49 /* batch size */,
-    v75 /* input */, v76 /* output */,
+    v75.data() /* input */, v76.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #75" << std::endl;
@@ -2984,7 +2985,7 @@
   status = xnn_setup_global_average_pooling_nwc_f16(
     op76,
     1 /* batch size */, 49 /* width */,
-    v76 /* input */, v77 /* output */,
+    v76.data() /* input */, v77.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #76" << std::endl;
@@ -2994,7 +2995,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op77,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v77 /* input */, v78 /* output */,
+    v77.data() /* input */, v78.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #77" << std::endl;
@@ -3004,7 +3005,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op78,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v78 /* input */, v79 /* output */,
+    v78.data() /* input */, v79.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #78" << std::endl;
@@ -3017,7 +3018,7 @@
     status = xnn_setup_multiply_nd_f16(
       op79,
       4, a_shape, 4, b_shape,
-      v76 /* a */, v79 /* b */, v80 /* output */,
+      v76.data() /* a */, v79.data() /* b */, v80.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -3028,7 +3029,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op80,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v80 /* input */, v81 /* output */,
+    v80.data() /* input */, v81.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #80" << std::endl;
@@ -3041,7 +3042,7 @@
     status = xnn_setup_add_nd_f16(
       op81,
       4, a_shape, 4, b_shape,
-      v81 /* a */, v72 /* b */, v82 /* output */,
+      v81.data() /* a */, v72.data() /* b */, v82.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -3052,7 +3053,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op82,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v82 /* input */, v83 /* output */,
+    v82.data() /* input */, v83.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #82" << std::endl;
@@ -3062,7 +3063,7 @@
   status = xnn_setup_hardswish_nc_f16(
     op83,
     49 /* batch size */,
-    v83 /* input */, v84 /* output */,
+    v83.data() /* input */, v84.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #83" << std::endl;
@@ -3072,7 +3073,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op84,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v84 /* input */, v85 /* output */,
+    v84.data() /* input */, v85.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #84" << std::endl;
@@ -3082,7 +3083,7 @@
   status = xnn_setup_hardswish_nc_f16(
     op85,
     49 /* batch size */,
-    v85 /* input */, v86 /* output */,
+    v85.data() /* input */, v86.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #85" << std::endl;
@@ -3092,7 +3093,7 @@
   status = xnn_setup_global_average_pooling_nwc_f16(
     op86,
     1 /* batch size */, 49 /* width */,
-    v86 /* input */, v87 /* output */,
+    v86.data() /* input */, v87.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #86" << std::endl;
@@ -3102,7 +3103,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op87,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v87 /* input */, v88 /* output */,
+    v87.data() /* input */, v88.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #87" << std::endl;
@@ -3112,7 +3113,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op88,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v88 /* input */, v89 /* output */,
+    v88.data() /* input */, v89.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #88" << std::endl;
@@ -3125,7 +3126,7 @@
     status = xnn_setup_multiply_nd_f16(
       op89,
       4, a_shape, 4, b_shape,
-      v86 /* a */, v89 /* b */, v90 /* output */,
+      v86.data() /* a */, v89.data() /* b */, v90.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -3136,7 +3137,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op90,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v90 /* input */, v91 /* output */,
+    v90.data() /* input */, v91.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #90" << std::endl;
@@ -3149,7 +3150,7 @@
     status = xnn_setup_add_nd_f16(
       op91,
       4, a_shape, 4, b_shape,
-      v91 /* a */, v82 /* b */, v92 /* output */,
+      v91.data() /* a */, v82.data() /* b */, v92.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -3160,7 +3161,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op92,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v92 /* input */, v93 /* output */,
+    v92.data() /* input */, v93.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #92" << std::endl;
@@ -3170,7 +3171,7 @@
   status = xnn_setup_hardswish_nc_f16(
     op93,
     49 /* batch size */,
-    v93 /* input */, v94 /* output */,
+    v93.data() /* input */, v94.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #93" << std::endl;
@@ -3180,7 +3181,7 @@
   status = xnn_setup_global_average_pooling_nwc_f16(
     op94,
     1 /* batch size */, 49 /* width */,
-    v94 /* input */, v95 /* output */,
+    v94.data() /* input */, v95.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #94" << std::endl;
@@ -3190,7 +3191,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op95,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v95 /* input */, v96 /* output */,
+    v95.data() /* input */, v96.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #95" << std::endl;
@@ -3200,7 +3201,7 @@
   status = xnn_setup_hardswish_nc_f16(
     op96,
     1 /* batch size */,
-    v96 /* input */, v97 /* output */,
+    v96.data() /* input */, v97.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #96" << std::endl;
@@ -3210,7 +3211,7 @@
   status = xnn_setup_global_average_pooling_nwc_f16(
     op97,
     1 /* batch size */, 1 /* width */,
-    v97 /* input */, v98 /* output */,
+    v97.data() /* input */, v98.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #97" << std::endl;
@@ -3220,7 +3221,7 @@
   status = xnn_setup_convolution2d_nhwc_f16(
     op98,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v98 /* input */, v99 /* output */,
+    v98.data() /* input */, v99.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #98" << std::endl;
diff --git a/models/fp32-mobilenet-v1.cc b/models/fp32-mobilenet-v1.cc
index 212ce9d..011868f 100644
--- a/models/fp32-mobilenet-v1.cc
+++ b/models/fp32-mobilenet-v1.cc
@@ -5,6 +5,7 @@
 
 #include <xnnpack.h>
 
+#include <array>
 #include <algorithm>
 #include <functional>
 #include <iostream>
@@ -16,182 +17,182 @@
 namespace models {
 
 ExecutionPlan FP32MobileNetV1(pthreadpool_t threadpool) {
-  alignas(16) static float v0[150528];
-  alignas(16) static float v1[401408];
-  alignas(16) static float v2[401408];
-  alignas(16) static float v3[802816];
-  alignas(16) static float v4[200704];
-  alignas(16) static float v5[401408];
-  alignas(16) static float v6[401408];
-  alignas(16) static float v7[401408];
-  alignas(16) static float v8[100352];
-  alignas(16) static float v9[200704];
-  alignas(16) static float v10[200704];
-  alignas(16) static float v11[200704];
-  alignas(16) static float v12[50176];
-  alignas(16) static float v13[100352];
-  alignas(16) static float v14[100352];
-  alignas(16) static float v15[100352];
-  alignas(16) static float v16[100352];
-  alignas(16) static float v17[100352];
-  alignas(16) static float v18[100352];
-  alignas(16) static float v19[100352];
-  alignas(16) static float v20[100352];
-  alignas(16) static float v21[100352];
-  alignas(16) static float v22[100352];
-  alignas(16) static float v23[100352];
-  alignas(16) static float v24[25088];
-  alignas(16) static float v25[50176];
-  alignas(16) static float v26[50176];
-  alignas(16) static float v27[50176];
-  alignas(16) static float v28[1024];
-  alignas(16) static float v29[1001];
-  alignas(16) static float w30[864];
-  alignas(16) static float w31[32];
-  alignas(16) static float w32[288];
-  alignas(16) static float w33[32];
-  alignas(16) static float w34[2048];
-  alignas(16) static float w35[64];
-  alignas(16) static float w36[576];
-  alignas(16) static float w37[64];
-  alignas(16) static float w38[8192];
-  alignas(16) static float w39[128];
-  alignas(16) static float w40[1152];
-  alignas(16) static float w41[128];
-  alignas(16) static float w42[16384];
-  alignas(16) static float w43[128];
-  alignas(16) static float w44[1152];
-  alignas(16) static float w45[128];
-  alignas(16) static float w46[32768];
-  alignas(16) static float w47[256];
-  alignas(16) static float w48[2304];
-  alignas(16) static float w49[256];
-  alignas(16) static float w50[65536];
-  alignas(16) static float w51[256];
-  alignas(16) static float w52[2304];
-  alignas(16) static float w53[256];
-  alignas(16) static float w54[131072];
-  alignas(16) static float w55[512];
-  alignas(16) static float w56[4608];
-  alignas(16) static float w57[512];
-  alignas(16) static float w58[262144];
-  alignas(16) static float w59[512];
-  alignas(16) static float w60[4608];
-  alignas(16) static float w61[512];
-  alignas(16) static float w62[262144];
-  alignas(16) static float w63[512];
-  alignas(16) static float w64[4608];
-  alignas(16) static float w65[512];
-  alignas(16) static float w66[262144];
-  alignas(16) static float w67[512];
-  alignas(16) static float w68[4608];
-  alignas(16) static float w69[512];
-  alignas(16) static float w70[262144];
-  alignas(16) static float w71[512];
-  alignas(16) static float w72[4608];
-  alignas(16) static float w73[512];
-  alignas(16) static float w74[262144];
-  alignas(16) static float w75[512];
-  alignas(16) static float w76[4608];
-  alignas(16) static float w77[512];
-  alignas(16) static float w78[524288];
-  alignas(16) static float w79[1024];
-  alignas(16) static float w80[9216];
-  alignas(16) static float w81[1024];
-  alignas(16) static float w82[1048576];
-  alignas(16) static float w83[1024];
-  alignas(16) static float w84[1025024];
-  alignas(16) static float w85[1001];
+  alignas(16) static std::array<float, 150528> v0;
+  alignas(16) static std::array<float, 401408> v1;
+  alignas(16) static std::array<float, 401408> v2;
+  alignas(16) static std::array<float, 802816> v3;
+  alignas(16) static std::array<float, 200704> v4;
+  alignas(16) static std::array<float, 401408> v5;
+  alignas(16) static std::array<float, 401408> v6;
+  alignas(16) static std::array<float, 401408> v7;
+  alignas(16) static std::array<float, 100352> v8;
+  alignas(16) static std::array<float, 200704> v9;
+  alignas(16) static std::array<float, 200704> v10;
+  alignas(16) static std::array<float, 200704> v11;
+  alignas(16) static std::array<float, 50176> v12;
+  alignas(16) static std::array<float, 100352> v13;
+  alignas(16) static std::array<float, 100352> v14;
+  alignas(16) static std::array<float, 100352> v15;
+  alignas(16) static std::array<float, 100352> v16;
+  alignas(16) static std::array<float, 100352> v17;
+  alignas(16) static std::array<float, 100352> v18;
+  alignas(16) static std::array<float, 100352> v19;
+  alignas(16) static std::array<float, 100352> v20;
+  alignas(16) static std::array<float, 100352> v21;
+  alignas(16) static std::array<float, 100352> v22;
+  alignas(16) static std::array<float, 100352> v23;
+  alignas(16) static std::array<float, 25088> v24;
+  alignas(16) static std::array<float, 50176> v25;
+  alignas(16) static std::array<float, 50176> v26;
+  alignas(16) static std::array<float, 50176> v27;
+  alignas(16) static std::array<float, 1024> v28;
+  alignas(16) static std::array<float, 1001> v29;
+  alignas(16) static std::array<float, 864> w30;
+  alignas(16) static std::array<float, 32> w31;
+  alignas(16) static std::array<float, 288> w32;
+  alignas(16) static std::array<float, 32> w33;
+  alignas(16) static std::array<float, 2048> w34;
+  alignas(16) static std::array<float, 64> w35;
+  alignas(16) static std::array<float, 576> w36;
+  alignas(16) static std::array<float, 64> w37;
+  alignas(16) static std::array<float, 8192> w38;
+  alignas(16) static std::array<float, 128> w39;
+  alignas(16) static std::array<float, 1152> w40;
+  alignas(16) static std::array<float, 128> w41;
+  alignas(16) static std::array<float, 16384> w42;
+  alignas(16) static std::array<float, 128> w43;
+  alignas(16) static std::array<float, 1152> w44;
+  alignas(16) static std::array<float, 128> w45;
+  alignas(16) static std::array<float, 32768> w46;
+  alignas(16) static std::array<float, 256> w47;
+  alignas(16) static std::array<float, 2304> w48;
+  alignas(16) static std::array<float, 256> w49;
+  alignas(16) static std::array<float, 65536> w50;
+  alignas(16) static std::array<float, 256> w51;
+  alignas(16) static std::array<float, 2304> w52;
+  alignas(16) static std::array<float, 256> w53;
+  alignas(16) static std::array<float, 131072> w54;
+  alignas(16) static std::array<float, 512> w55;
+  alignas(16) static std::array<float, 4608> w56;
+  alignas(16) static std::array<float, 512> w57;
+  alignas(16) static std::array<float, 262144> w58;
+  alignas(16) static std::array<float, 512> w59;
+  alignas(16) static std::array<float, 4608> w60;
+  alignas(16) static std::array<float, 512> w61;
+  alignas(16) static std::array<float, 262144> w62;
+  alignas(16) static std::array<float, 512> w63;
+  alignas(16) static std::array<float, 4608> w64;
+  alignas(16) static std::array<float, 512> w65;
+  alignas(16) static std::array<float, 262144> w66;
+  alignas(16) static std::array<float, 512> w67;
+  alignas(16) static std::array<float, 4608> w68;
+  alignas(16) static std::array<float, 512> w69;
+  alignas(16) static std::array<float, 262144> w70;
+  alignas(16) static std::array<float, 512> w71;
+  alignas(16) static std::array<float, 4608> w72;
+  alignas(16) static std::array<float, 512> w73;
+  alignas(16) static std::array<float, 262144> w74;
+  alignas(16) static std::array<float, 512> w75;
+  alignas(16) static std::array<float, 4608> w76;
+  alignas(16) static std::array<float, 512> w77;
+  alignas(16) static std::array<float, 524288> w78;
+  alignas(16) static std::array<float, 1024> w79;
+  alignas(16) static std::array<float, 9216> w80;
+  alignas(16) static std::array<float, 1024> w81;
+  alignas(16) static std::array<float, 1048576> w82;
+  alignas(16) static std::array<float, 1024> w83;
+  alignas(16) static std::array<float, 1025024> w84;
+  alignas(16) static std::array<float, 1001> w85;
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.0f, +1.0f), std::ref(rng));
-  std::generate(v0, v0 + 150528, std::ref(f32rng));
-  std::generate(v1, v1 + 401408, std::ref(f32rng));
-  std::generate(v2, v2 + 401408, std::ref(f32rng));
-  std::generate(v3, v3 + 802816, std::ref(f32rng));
-  std::generate(v4, v4 + 200704, std::ref(f32rng));
-  std::generate(v5, v5 + 401408, std::ref(f32rng));
-  std::generate(v6, v6 + 401408, std::ref(f32rng));
-  std::generate(v7, v7 + 401408, std::ref(f32rng));
-  std::generate(v8, v8 + 100352, std::ref(f32rng));
-  std::generate(v9, v9 + 200704, std::ref(f32rng));
-  std::generate(v10, v10 + 200704, std::ref(f32rng));
-  std::generate(v11, v11 + 200704, std::ref(f32rng));
-  std::generate(v12, v12 + 50176, std::ref(f32rng));
-  std::generate(v13, v13 + 100352, std::ref(f32rng));
-  std::generate(v14, v14 + 100352, std::ref(f32rng));
-  std::generate(v15, v15 + 100352, std::ref(f32rng));
-  std::generate(v16, v16 + 100352, std::ref(f32rng));
-  std::generate(v17, v17 + 100352, std::ref(f32rng));
-  std::generate(v18, v18 + 100352, std::ref(f32rng));
-  std::generate(v19, v19 + 100352, std::ref(f32rng));
-  std::generate(v20, v20 + 100352, std::ref(f32rng));
-  std::generate(v21, v21 + 100352, std::ref(f32rng));
-  std::generate(v22, v22 + 100352, std::ref(f32rng));
-  std::generate(v23, v23 + 100352, std::ref(f32rng));
-  std::generate(v24, v24 + 25088, std::ref(f32rng));
-  std::generate(v25, v25 + 50176, std::ref(f32rng));
-  std::generate(v26, v26 + 50176, std::ref(f32rng));
-  std::generate(v27, v27 + 50176, std::ref(f32rng));
-  std::generate(v28, v28 + 1024, std::ref(f32rng));
-  std::generate(v29, v29 + 1001, std::ref(f32rng));
-  std::generate(w30, w30 + 864, std::ref(f32rng));
-  std::generate(w31, w31 + 32, std::ref(f32rng));
-  std::generate(w32, w32 + 288, std::ref(f32rng));
-  std::generate(w33, w33 + 32, std::ref(f32rng));
-  std::generate(w34, w34 + 2048, std::ref(f32rng));
-  std::generate(w35, w35 + 64, std::ref(f32rng));
-  std::generate(w36, w36 + 576, std::ref(f32rng));
-  std::generate(w37, w37 + 64, std::ref(f32rng));
-  std::generate(w38, w38 + 8192, std::ref(f32rng));
-  std::generate(w39, w39 + 128, std::ref(f32rng));
-  std::generate(w40, w40 + 1152, std::ref(f32rng));
-  std::generate(w41, w41 + 128, std::ref(f32rng));
-  std::generate(w42, w42 + 16384, std::ref(f32rng));
-  std::generate(w43, w43 + 128, std::ref(f32rng));
-  std::generate(w44, w44 + 1152, std::ref(f32rng));
-  std::generate(w45, w45 + 128, std::ref(f32rng));
-  std::generate(w46, w46 + 32768, std::ref(f32rng));
-  std::generate(w47, w47 + 256, std::ref(f32rng));
-  std::generate(w48, w48 + 2304, std::ref(f32rng));
-  std::generate(w49, w49 + 256, std::ref(f32rng));
-  std::generate(w50, w50 + 65536, std::ref(f32rng));
-  std::generate(w51, w51 + 256, std::ref(f32rng));
-  std::generate(w52, w52 + 2304, std::ref(f32rng));
-  std::generate(w53, w53 + 256, std::ref(f32rng));
-  std::generate(w54, w54 + 131072, std::ref(f32rng));
-  std::generate(w55, w55 + 512, std::ref(f32rng));
-  std::generate(w56, w56 + 4608, std::ref(f32rng));
-  std::generate(w57, w57 + 512, std::ref(f32rng));
-  std::generate(w58, w58 + 262144, std::ref(f32rng));
-  std::generate(w59, w59 + 512, std::ref(f32rng));
-  std::generate(w60, w60 + 4608, std::ref(f32rng));
-  std::generate(w61, w61 + 512, std::ref(f32rng));
-  std::generate(w62, w62 + 262144, std::ref(f32rng));
-  std::generate(w63, w63 + 512, std::ref(f32rng));
-  std::generate(w64, w64 + 4608, std::ref(f32rng));
-  std::generate(w65, w65 + 512, std::ref(f32rng));
-  std::generate(w66, w66 + 262144, std::ref(f32rng));
-  std::generate(w67, w67 + 512, std::ref(f32rng));
-  std::generate(w68, w68 + 4608, std::ref(f32rng));
-  std::generate(w69, w69 + 512, std::ref(f32rng));
-  std::generate(w70, w70 + 262144, std::ref(f32rng));
-  std::generate(w71, w71 + 512, std::ref(f32rng));
-  std::generate(w72, w72 + 4608, std::ref(f32rng));
-  std::generate(w73, w73 + 512, std::ref(f32rng));
-  std::generate(w74, w74 + 262144, std::ref(f32rng));
-  std::generate(w75, w75 + 512, std::ref(f32rng));
-  std::generate(w76, w76 + 4608, std::ref(f32rng));
-  std::generate(w77, w77 + 512, std::ref(f32rng));
-  std::generate(w78, w78 + 524288, std::ref(f32rng));
-  std::generate(w79, w79 + 1024, std::ref(f32rng));
-  std::generate(w80, w80 + 9216, std::ref(f32rng));
-  std::generate(w81, w81 + 1024, std::ref(f32rng));
-  std::generate(w82, w82 + 1048576, std::ref(f32rng));
-  std::generate(w83, w83 + 1024, std::ref(f32rng));
-  std::generate(w84, w84 + 1025024, std::ref(f32rng));
-  std::generate(w85, w85 + 1001, std::ref(f32rng));
+  std::generate(v0.begin(), v0.end(), std::ref(f32rng));
+  std::generate(v1.begin(), v1.end(), std::ref(f32rng));
+  std::generate(v2.begin(), v2.end(), std::ref(f32rng));
+  std::generate(v3.begin(), v3.end(), std::ref(f32rng));
+  std::generate(v4.begin(), v4.end(), std::ref(f32rng));
+  std::generate(v5.begin(), v5.end(), std::ref(f32rng));
+  std::generate(v6.begin(), v6.end(), std::ref(f32rng));
+  std::generate(v7.begin(), v7.end(), std::ref(f32rng));
+  std::generate(v8.begin(), v8.end(), std::ref(f32rng));
+  std::generate(v9.begin(), v9.end(), std::ref(f32rng));
+  std::generate(v10.begin(), v10.end(), std::ref(f32rng));
+  std::generate(v11.begin(), v11.end(), std::ref(f32rng));
+  std::generate(v12.begin(), v12.end(), std::ref(f32rng));
+  std::generate(v13.begin(), v13.end(), std::ref(f32rng));
+  std::generate(v14.begin(), v14.end(), std::ref(f32rng));
+  std::generate(v15.begin(), v15.end(), std::ref(f32rng));
+  std::generate(v16.begin(), v16.end(), std::ref(f32rng));
+  std::generate(v17.begin(), v17.end(), std::ref(f32rng));
+  std::generate(v18.begin(), v18.end(), std::ref(f32rng));
+  std::generate(v19.begin(), v19.end(), std::ref(f32rng));
+  std::generate(v20.begin(), v20.end(), std::ref(f32rng));
+  std::generate(v21.begin(), v21.end(), std::ref(f32rng));
+  std::generate(v22.begin(), v22.end(), std::ref(f32rng));
+  std::generate(v23.begin(), v23.end(), std::ref(f32rng));
+  std::generate(v24.begin(), v24.end(), std::ref(f32rng));
+  std::generate(v25.begin(), v25.end(), std::ref(f32rng));
+  std::generate(v26.begin(), v26.end(), std::ref(f32rng));
+  std::generate(v27.begin(), v27.end(), std::ref(f32rng));
+  std::generate(v28.begin(), v28.end(), std::ref(f32rng));
+  std::generate(v29.begin(), v29.end(), std::ref(f32rng));
+  std::generate(w30.begin(), w30.end(), std::ref(f32rng));
+  std::generate(w31.begin(), w31.end(), std::ref(f32rng));
+  std::generate(w32.begin(), w32.end(), std::ref(f32rng));
+  std::generate(w33.begin(), w33.end(), std::ref(f32rng));
+  std::generate(w34.begin(), w34.end(), std::ref(f32rng));
+  std::generate(w35.begin(), w35.end(), std::ref(f32rng));
+  std::generate(w36.begin(), w36.end(), std::ref(f32rng));
+  std::generate(w37.begin(), w37.end(), std::ref(f32rng));
+  std::generate(w38.begin(), w38.end(), std::ref(f32rng));
+  std::generate(w39.begin(), w39.end(), std::ref(f32rng));
+  std::generate(w40.begin(), w40.end(), std::ref(f32rng));
+  std::generate(w41.begin(), w41.end(), std::ref(f32rng));
+  std::generate(w42.begin(), w42.end(), std::ref(f32rng));
+  std::generate(w43.begin(), w43.end(), std::ref(f32rng));
+  std::generate(w44.begin(), w44.end(), std::ref(f32rng));
+  std::generate(w45.begin(), w45.end(), std::ref(f32rng));
+  std::generate(w46.begin(), w46.end(), std::ref(f32rng));
+  std::generate(w47.begin(), w47.end(), std::ref(f32rng));
+  std::generate(w48.begin(), w48.end(), std::ref(f32rng));
+  std::generate(w49.begin(), w49.end(), std::ref(f32rng));
+  std::generate(w50.begin(), w50.end(), std::ref(f32rng));
+  std::generate(w51.begin(), w51.end(), std::ref(f32rng));
+  std::generate(w52.begin(), w52.end(), std::ref(f32rng));
+  std::generate(w53.begin(), w53.end(), std::ref(f32rng));
+  std::generate(w54.begin(), w54.end(), std::ref(f32rng));
+  std::generate(w55.begin(), w55.end(), std::ref(f32rng));
+  std::generate(w56.begin(), w56.end(), std::ref(f32rng));
+  std::generate(w57.begin(), w57.end(), std::ref(f32rng));
+  std::generate(w58.begin(), w58.end(), std::ref(f32rng));
+  std::generate(w59.begin(), w59.end(), std::ref(f32rng));
+  std::generate(w60.begin(), w60.end(), std::ref(f32rng));
+  std::generate(w61.begin(), w61.end(), std::ref(f32rng));
+  std::generate(w62.begin(), w62.end(), std::ref(f32rng));
+  std::generate(w63.begin(), w63.end(), std::ref(f32rng));
+  std::generate(w64.begin(), w64.end(), std::ref(f32rng));
+  std::generate(w65.begin(), w65.end(), std::ref(f32rng));
+  std::generate(w66.begin(), w66.end(), std::ref(f32rng));
+  std::generate(w67.begin(), w67.end(), std::ref(f32rng));
+  std::generate(w68.begin(), w68.end(), std::ref(f32rng));
+  std::generate(w69.begin(), w69.end(), std::ref(f32rng));
+  std::generate(w70.begin(), w70.end(), std::ref(f32rng));
+  std::generate(w71.begin(), w71.end(), std::ref(f32rng));
+  std::generate(w72.begin(), w72.end(), std::ref(f32rng));
+  std::generate(w73.begin(), w73.end(), std::ref(f32rng));
+  std::generate(w74.begin(), w74.end(), std::ref(f32rng));
+  std::generate(w75.begin(), w75.end(), std::ref(f32rng));
+  std::generate(w76.begin(), w76.end(), std::ref(f32rng));
+  std::generate(w77.begin(), w77.end(), std::ref(f32rng));
+  std::generate(w78.begin(), w78.end(), std::ref(f32rng));
+  std::generate(w79.begin(), w79.end(), std::ref(f32rng));
+  std::generate(w80.begin(), w80.end(), std::ref(f32rng));
+  std::generate(w81.begin(), w81.end(), std::ref(f32rng));
+  std::generate(w82.begin(), w82.end(), std::ref(f32rng));
+  std::generate(w83.begin(), w83.end(), std::ref(f32rng));
+  std::generate(w84.begin(), w84.end(), std::ref(f32rng));
+  std::generate(w85.begin(), w85.end(), std::ref(f32rng));
 
   ExecutionPlan operators;
   xnn_status status;
@@ -208,7 +209,7 @@
     32 /* output_channels_per_group */,
     3 /* input pixel stride */,
     32 /* output pixel stride */,
-    w30, w31,
+    w30.data(), w31.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op0);
@@ -230,7 +231,7 @@
     1 /* output_channels_per_group */,
     32 /* input pixel stride */,
     32 /* output pixel stride */,
-    w32, w33,
+    w32.data(), w33.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op1);
@@ -252,7 +253,7 @@
     64 /* output_channels_per_group */,
     32 /* input pixel stride */,
     64 /* output pixel stride */,
-    w34, w35,
+    w34.data(), w35.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op2);
@@ -274,7 +275,7 @@
     1 /* output_channels_per_group */,
     64 /* input pixel stride */,
     64 /* output pixel stride */,
-    w36, w37,
+    w36.data(), w37.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op3);
@@ -296,7 +297,7 @@
     128 /* output_channels_per_group */,
     64 /* input pixel stride */,
     128 /* output pixel stride */,
-    w38, w39,
+    w38.data(), w39.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op4);
@@ -318,7 +319,7 @@
     1 /* output_channels_per_group */,
     128 /* input pixel stride */,
     128 /* output pixel stride */,
-    w40, w41,
+    w40.data(), w41.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op5);
@@ -340,7 +341,7 @@
     128 /* output_channels_per_group */,
     128 /* input pixel stride */,
     128 /* output pixel stride */,
-    w42, w43,
+    w42.data(), w43.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op6);
@@ -362,7 +363,7 @@
     1 /* output_channels_per_group */,
     128 /* input pixel stride */,
     128 /* output pixel stride */,
-    w44, w45,
+    w44.data(), w45.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op7);
@@ -384,7 +385,7 @@
     256 /* output_channels_per_group */,
     128 /* input pixel stride */,
     256 /* output pixel stride */,
-    w46, w47,
+    w46.data(), w47.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op8);
@@ -406,7 +407,7 @@
     1 /* output_channels_per_group */,
     256 /* input pixel stride */,
     256 /* output pixel stride */,
-    w48, w49,
+    w48.data(), w49.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op9);
@@ -428,7 +429,7 @@
     256 /* output_channels_per_group */,
     256 /* input pixel stride */,
     256 /* output pixel stride */,
-    w50, w51,
+    w50.data(), w51.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op10);
@@ -450,7 +451,7 @@
     1 /* output_channels_per_group */,
     256 /* input pixel stride */,
     256 /* output pixel stride */,
-    w52, w53,
+    w52.data(), w53.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op11);
@@ -472,7 +473,7 @@
     512 /* output_channels_per_group */,
     256 /* input pixel stride */,
     512 /* output pixel stride */,
-    w54, w55,
+    w54.data(), w55.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op12);
@@ -494,7 +495,7 @@
     1 /* output_channels_per_group */,
     512 /* input pixel stride */,
     512 /* output pixel stride */,
-    w56, w57,
+    w56.data(), w57.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op13);
@@ -516,7 +517,7 @@
     512 /* output_channels_per_group */,
     512 /* input pixel stride */,
     512 /* output pixel stride */,
-    w58, w59,
+    w58.data(), w59.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op14);
@@ -538,7 +539,7 @@
     1 /* output_channels_per_group */,
     512 /* input pixel stride */,
     512 /* output pixel stride */,
-    w60, w61,
+    w60.data(), w61.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op15);
@@ -560,7 +561,7 @@
     512 /* output_channels_per_group */,
     512 /* input pixel stride */,
     512 /* output pixel stride */,
-    w62, w63,
+    w62.data(), w63.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op16);
@@ -582,7 +583,7 @@
     1 /* output_channels_per_group */,
     512 /* input pixel stride */,
     512 /* output pixel stride */,
-    w64, w65,
+    w64.data(), w65.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op17);
@@ -604,7 +605,7 @@
     512 /* output_channels_per_group */,
     512 /* input pixel stride */,
     512 /* output pixel stride */,
-    w66, w67,
+    w66.data(), w67.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op18);
@@ -626,7 +627,7 @@
     1 /* output_channels_per_group */,
     512 /* input pixel stride */,
     512 /* output pixel stride */,
-    w68, w69,
+    w68.data(), w69.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op19);
@@ -648,7 +649,7 @@
     512 /* output_channels_per_group */,
     512 /* input pixel stride */,
     512 /* output pixel stride */,
-    w70, w71,
+    w70.data(), w71.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op20);
@@ -670,7 +671,7 @@
     1 /* output_channels_per_group */,
     512 /* input pixel stride */,
     512 /* output pixel stride */,
-    w72, w73,
+    w72.data(), w73.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op21);
@@ -692,7 +693,7 @@
     512 /* output_channels_per_group */,
     512 /* input pixel stride */,
     512 /* output pixel stride */,
-    w74, w75,
+    w74.data(), w75.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op22);
@@ -714,7 +715,7 @@
     1 /* output_channels_per_group */,
     512 /* input pixel stride */,
     512 /* output pixel stride */,
-    w76, w77,
+    w76.data(), w77.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op23);
@@ -736,7 +737,7 @@
     1024 /* output_channels_per_group */,
     512 /* input pixel stride */,
     1024 /* output pixel stride */,
-    w78, w79,
+    w78.data(), w79.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op24);
@@ -758,7 +759,7 @@
     1 /* output_channels_per_group */,
     1024 /* input pixel stride */,
     1024 /* output pixel stride */,
-    w80, w81,
+    w80.data(), w81.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op25);
@@ -780,7 +781,7 @@
     1024 /* output_channels_per_group */,
     1024 /* input pixel stride */,
     1024 /* output pixel stride */,
-    w82, w83,
+    w82.data(), w83.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op26);
@@ -814,7 +815,7 @@
     1001 /* output_channels_per_group */,
     1024 /* input pixel stride */,
     1001 /* output pixel stride */,
-    w84, w85,
+    w84.data(), w85.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op28);
@@ -829,7 +830,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op0,
     1 /* batch size */, 224 /* input height */, 224 /* input width */,
-    v0 /* input */, v1 /* output */,
+    v0.data() /* input */, v1.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #0" << std::endl;
@@ -839,7 +840,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op1,
     1 /* batch size */, 112 /* input height */, 112 /* input width */,
-    v1 /* input */, v2 /* output */,
+    v1.data() /* input */, v2.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #1" << std::endl;
@@ -849,7 +850,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op2,
     1 /* batch size */, 112 /* input height */, 112 /* input width */,
-    v2 /* input */, v3 /* output */,
+    v2.data() /* input */, v3.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #2" << std::endl;
@@ -859,7 +860,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op3,
     1 /* batch size */, 112 /* input height */, 112 /* input width */,
-    v3 /* input */, v4 /* output */,
+    v3.data() /* input */, v4.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #3" << std::endl;
@@ -869,7 +870,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op4,
     1 /* batch size */, 56 /* input height */, 56 /* input width */,
-    v4 /* input */, v5 /* output */,
+    v4.data() /* input */, v5.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #4" << std::endl;
@@ -879,7 +880,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op5,
     1 /* batch size */, 56 /* input height */, 56 /* input width */,
-    v5 /* input */, v6 /* output */,
+    v5.data() /* input */, v6.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #5" << std::endl;
@@ -889,7 +890,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op6,
     1 /* batch size */, 56 /* input height */, 56 /* input width */,
-    v6 /* input */, v7 /* output */,
+    v6.data() /* input */, v7.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #6" << std::endl;
@@ -899,7 +900,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op7,
     1 /* batch size */, 56 /* input height */, 56 /* input width */,
-    v7 /* input */, v8 /* output */,
+    v7.data() /* input */, v8.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #7" << std::endl;
@@ -909,7 +910,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op8,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v8 /* input */, v9 /* output */,
+    v8.data() /* input */, v9.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #8" << std::endl;
@@ -919,7 +920,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op9,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v9 /* input */, v10 /* output */,
+    v9.data() /* input */, v10.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #9" << std::endl;
@@ -929,7 +930,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op10,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v10 /* input */, v11 /* output */,
+    v10.data() /* input */, v11.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #10" << std::endl;
@@ -939,7 +940,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op11,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v11 /* input */, v12 /* output */,
+    v11.data() /* input */, v12.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #11" << std::endl;
@@ -949,7 +950,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op12,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v12 /* input */, v13 /* output */,
+    v12.data() /* input */, v13.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #12" << std::endl;
@@ -959,7 +960,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op13,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v13 /* input */, v14 /* output */,
+    v13.data() /* input */, v14.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #13" << std::endl;
@@ -969,7 +970,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op14,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v14 /* input */, v15 /* output */,
+    v14.data() /* input */, v15.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #14" << std::endl;
@@ -979,7 +980,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op15,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v15 /* input */, v16 /* output */,
+    v15.data() /* input */, v16.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #15" << std::endl;
@@ -989,7 +990,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op16,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v16 /* input */, v17 /* output */,
+    v16.data() /* input */, v17.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #16" << std::endl;
@@ -999,7 +1000,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op17,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v17 /* input */, v18 /* output */,
+    v17.data() /* input */, v18.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #17" << std::endl;
@@ -1009,7 +1010,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op18,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v18 /* input */, v19 /* output */,
+    v18.data() /* input */, v19.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #18" << std::endl;
@@ -1019,7 +1020,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op19,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v19 /* input */, v20 /* output */,
+    v19.data() /* input */, v20.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #19" << std::endl;
@@ -1029,7 +1030,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op20,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v20 /* input */, v21 /* output */,
+    v20.data() /* input */, v21.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #20" << std::endl;
@@ -1039,7 +1040,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op21,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v21 /* input */, v22 /* output */,
+    v21.data() /* input */, v22.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #21" << std::endl;
@@ -1049,7 +1050,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op22,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v22 /* input */, v23 /* output */,
+    v22.data() /* input */, v23.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #22" << std::endl;
@@ -1059,7 +1060,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op23,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v23 /* input */, v24 /* output */,
+    v23.data() /* input */, v24.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #23" << std::endl;
@@ -1069,7 +1070,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op24,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v24 /* input */, v25 /* output */,
+    v24.data() /* input */, v25.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #24" << std::endl;
@@ -1079,7 +1080,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op25,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v25 /* input */, v26 /* output */,
+    v25.data() /* input */, v26.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #25" << std::endl;
@@ -1089,7 +1090,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op26,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v26 /* input */, v27 /* output */,
+    v26.data() /* input */, v27.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #26" << std::endl;
@@ -1099,7 +1100,7 @@
   status = xnn_setup_global_average_pooling_nwc_f32(
     op27,
     1 /* batch size */, 49 /* width */,
-    v27 /* input */, v28 /* output */,
+    v27.data() /* input */, v28.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #27" << std::endl;
@@ -1109,7 +1110,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op28,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v28 /* input */, v29 /* output */,
+    v28.data() /* input */, v29.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #28" << std::endl;
diff --git a/models/fp32-mobilenet-v2.cc b/models/fp32-mobilenet-v2.cc
index 14177a9..66180a5 100644
--- a/models/fp32-mobilenet-v2.cc
+++ b/models/fp32-mobilenet-v2.cc
@@ -5,6 +5,7 @@
 
 #include <xnnpack.h>
 
+#include <array>
 #include <algorithm>
 #include <functional>
 #include <iostream>
@@ -16,352 +17,352 @@
 namespace models {
 
 ExecutionPlan FP32MobileNetV2(pthreadpool_t threadpool) {
-  alignas(16) static float v0[150528];
-  alignas(16) static float v1[401408];
-  alignas(16) static float v2[401408];
-  alignas(16) static float v3[200704];
-  alignas(16) static float v4[1204224];
-  alignas(16) static float v5[301056];
-  alignas(16) static float v6[75264];
-  alignas(16) static float v7[451584];
-  alignas(16) static float v8[451584];
-  alignas(16) static float v9[75264];
-  alignas(16) static float v10[75264];
-  alignas(16) static float v11[451584];
-  alignas(16) static float v12[112896];
-  alignas(16) static float v13[25088];
-  alignas(16) static float v14[150528];
-  alignas(16) static float v15[150528];
-  alignas(16) static float v16[25088];
-  alignas(16) static float v17[25088];
-  alignas(16) static float v18[150528];
-  alignas(16) static float v19[150528];
-  alignas(16) static float v20[25088];
-  alignas(16) static float v21[25088];
-  alignas(16) static float v22[150528];
-  alignas(16) static float v23[37632];
-  alignas(16) static float v24[12544];
-  alignas(16) static float v25[75264];
-  alignas(16) static float v26[75264];
-  alignas(16) static float v27[12544];
-  alignas(16) static float v28[12544];
-  alignas(16) static float v29[75264];
-  alignas(16) static float v30[75264];
-  alignas(16) static float v31[12544];
-  alignas(16) static float v32[12544];
-  alignas(16) static float v33[75264];
-  alignas(16) static float v34[75264];
-  alignas(16) static float v35[12544];
-  alignas(16) static float v36[12544];
-  alignas(16) static float v37[75264];
-  alignas(16) static float v38[75264];
-  alignas(16) static float v39[18816];
-  alignas(16) static float v40[112896];
-  alignas(16) static float v41[112896];
-  alignas(16) static float v42[18816];
-  alignas(16) static float v43[18816];
-  alignas(16) static float v44[112896];
-  alignas(16) static float v45[112896];
-  alignas(16) static float v46[18816];
-  alignas(16) static float v47[18816];
-  alignas(16) static float v48[112896];
-  alignas(16) static float v49[28224];
-  alignas(16) static float v50[7840];
-  alignas(16) static float v51[47040];
-  alignas(16) static float v52[47040];
-  alignas(16) static float v53[7840];
-  alignas(16) static float v54[7840];
-  alignas(16) static float v55[47040];
-  alignas(16) static float v56[47040];
-  alignas(16) static float v57[7840];
-  alignas(16) static float v58[7840];
-  alignas(16) static float v59[47040];
-  alignas(16) static float v60[47040];
-  alignas(16) static float v61[15680];
-  alignas(16) static float v62[62720];
-  alignas(16) static float v63[1280];
-  alignas(16) static float v64[1001];
-  alignas(16) static float w65[864];
-  alignas(16) static float w66[32];
-  alignas(16) static float w67[288];
-  alignas(16) static float w68[32];
-  alignas(16) static float w69[512];
-  alignas(16) static float w70[16];
-  alignas(16) static float w71[1536];
-  alignas(16) static float w72[96];
-  alignas(16) static float w73[864];
-  alignas(16) static float w74[96];
-  alignas(16) static float w75[2304];
-  alignas(16) static float w76[24];
-  alignas(16) static float w77[3456];
-  alignas(16) static float w78[144];
-  alignas(16) static float w79[1296];
-  alignas(16) static float w80[144];
-  alignas(16) static float w81[3456];
-  alignas(16) static float w82[24];
-  alignas(16) static float w83[3456];
-  alignas(16) static float w84[144];
-  alignas(16) static float w85[1296];
-  alignas(16) static float w86[144];
-  alignas(16) static float w87[4608];
-  alignas(16) static float w88[32];
-  alignas(16) static float w89[6144];
-  alignas(16) static float w90[192];
-  alignas(16) static float w91[1728];
-  alignas(16) static float w92[192];
-  alignas(16) static float w93[6144];
-  alignas(16) static float w94[32];
-  alignas(16) static float w95[6144];
-  alignas(16) static float w96[192];
-  alignas(16) static float w97[1728];
-  alignas(16) static float w98[192];
-  alignas(16) static float w99[6144];
-  alignas(16) static float w100[32];
-  alignas(16) static float w101[6144];
-  alignas(16) static float w102[192];
-  alignas(16) static float w103[1728];
-  alignas(16) static float w104[192];
-  alignas(16) static float w105[12288];
-  alignas(16) static float w106[64];
-  alignas(16) static float w107[24576];
-  alignas(16) static float w108[384];
-  alignas(16) static float w109[3456];
-  alignas(16) static float w110[384];
-  alignas(16) static float w111[24576];
-  alignas(16) static float w112[64];
-  alignas(16) static float w113[24576];
-  alignas(16) static float w114[384];
-  alignas(16) static float w115[3456];
-  alignas(16) static float w116[384];
-  alignas(16) static float w117[24576];
-  alignas(16) static float w118[64];
-  alignas(16) static float w119[24576];
-  alignas(16) static float w120[384];
-  alignas(16) static float w121[3456];
-  alignas(16) static float w122[384];
-  alignas(16) static float w123[24576];
-  alignas(16) static float w124[64];
-  alignas(16) static float w125[24576];
-  alignas(16) static float w126[384];
-  alignas(16) static float w127[3456];
-  alignas(16) static float w128[384];
-  alignas(16) static float w129[36864];
-  alignas(16) static float w130[96];
-  alignas(16) static float w131[55296];
-  alignas(16) static float w132[576];
-  alignas(16) static float w133[5184];
-  alignas(16) static float w134[576];
-  alignas(16) static float w135[55296];
-  alignas(16) static float w136[96];
-  alignas(16) static float w137[55296];
-  alignas(16) static float w138[576];
-  alignas(16) static float w139[5184];
-  alignas(16) static float w140[576];
-  alignas(16) static float w141[55296];
-  alignas(16) static float w142[96];
-  alignas(16) static float w143[55296];
-  alignas(16) static float w144[576];
-  alignas(16) static float w145[5184];
-  alignas(16) static float w146[576];
-  alignas(16) static float w147[92160];
-  alignas(16) static float w148[160];
-  alignas(16) static float w149[153600];
-  alignas(16) static float w150[960];
-  alignas(16) static float w151[8640];
-  alignas(16) static float w152[960];
-  alignas(16) static float w153[153600];
-  alignas(16) static float w154[160];
-  alignas(16) static float w155[153600];
-  alignas(16) static float w156[960];
-  alignas(16) static float w157[8640];
-  alignas(16) static float w158[960];
-  alignas(16) static float w159[153600];
-  alignas(16) static float w160[160];
-  alignas(16) static float w161[153600];
-  alignas(16) static float w162[960];
-  alignas(16) static float w163[8640];
-  alignas(16) static float w164[960];
-  alignas(16) static float w165[307200];
-  alignas(16) static float w166[320];
-  alignas(16) static float w167[409600];
-  alignas(16) static float w168[1280];
-  alignas(16) static float w169[1281280];
-  alignas(16) static float w170[1001];
+  alignas(16) static std::array<float, 150528> v0;
+  alignas(16) static std::array<float, 401408> v1;
+  alignas(16) static std::array<float, 401408> v2;
+  alignas(16) static std::array<float, 200704> v3;
+  alignas(16) static std::array<float, 1204224> v4;
+  alignas(16) static std::array<float, 301056> v5;
+  alignas(16) static std::array<float, 75264> v6;
+  alignas(16) static std::array<float, 451584> v7;
+  alignas(16) static std::array<float, 451584> v8;
+  alignas(16) static std::array<float, 75264> v9;
+  alignas(16) static std::array<float, 75264> v10;
+  alignas(16) static std::array<float, 451584> v11;
+  alignas(16) static std::array<float, 112896> v12;
+  alignas(16) static std::array<float, 25088> v13;
+  alignas(16) static std::array<float, 150528> v14;
+  alignas(16) static std::array<float, 150528> v15;
+  alignas(16) static std::array<float, 25088> v16;
+  alignas(16) static std::array<float, 25088> v17;
+  alignas(16) static std::array<float, 150528> v18;
+  alignas(16) static std::array<float, 150528> v19;
+  alignas(16) static std::array<float, 25088> v20;
+  alignas(16) static std::array<float, 25088> v21;
+  alignas(16) static std::array<float, 150528> v22;
+  alignas(16) static std::array<float, 37632> v23;
+  alignas(16) static std::array<float, 12544> v24;
+  alignas(16) static std::array<float, 75264> v25;
+  alignas(16) static std::array<float, 75264> v26;
+  alignas(16) static std::array<float, 12544> v27;
+  alignas(16) static std::array<float, 12544> v28;
+  alignas(16) static std::array<float, 75264> v29;
+  alignas(16) static std::array<float, 75264> v30;
+  alignas(16) static std::array<float, 12544> v31;
+  alignas(16) static std::array<float, 12544> v32;
+  alignas(16) static std::array<float, 75264> v33;
+  alignas(16) static std::array<float, 75264> v34;
+  alignas(16) static std::array<float, 12544> v35;
+  alignas(16) static std::array<float, 12544> v36;
+  alignas(16) static std::array<float, 75264> v37;
+  alignas(16) static std::array<float, 75264> v38;
+  alignas(16) static std::array<float, 18816> v39;
+  alignas(16) static std::array<float, 112896> v40;
+  alignas(16) static std::array<float, 112896> v41;
+  alignas(16) static std::array<float, 18816> v42;
+  alignas(16) static std::array<float, 18816> v43;
+  alignas(16) static std::array<float, 112896> v44;
+  alignas(16) static std::array<float, 112896> v45;
+  alignas(16) static std::array<float, 18816> v46;
+  alignas(16) static std::array<float, 18816> v47;
+  alignas(16) static std::array<float, 112896> v48;
+  alignas(16) static std::array<float, 28224> v49;
+  alignas(16) static std::array<float, 7840> v50;
+  alignas(16) static std::array<float, 47040> v51;
+  alignas(16) static std::array<float, 47040> v52;
+  alignas(16) static std::array<float, 7840> v53;
+  alignas(16) static std::array<float, 7840> v54;
+  alignas(16) static std::array<float, 47040> v55;
+  alignas(16) static std::array<float, 47040> v56;
+  alignas(16) static std::array<float, 7840> v57;
+  alignas(16) static std::array<float, 7840> v58;
+  alignas(16) static std::array<float, 47040> v59;
+  alignas(16) static std::array<float, 47040> v60;
+  alignas(16) static std::array<float, 15680> v61;
+  alignas(16) static std::array<float, 62720> v62;
+  alignas(16) static std::array<float, 1280> v63;
+  alignas(16) static std::array<float, 1001> v64;
+  alignas(16) static std::array<float, 864> w65;
+  alignas(16) static std::array<float, 32> w66;
+  alignas(16) static std::array<float, 288> w67;
+  alignas(16) static std::array<float, 32> w68;
+  alignas(16) static std::array<float, 512> w69;
+  alignas(16) static std::array<float, 16> w70;
+  alignas(16) static std::array<float, 1536> w71;
+  alignas(16) static std::array<float, 96> w72;
+  alignas(16) static std::array<float, 864> w73;
+  alignas(16) static std::array<float, 96> w74;
+  alignas(16) static std::array<float, 2304> w75;
+  alignas(16) static std::array<float, 24> w76;
+  alignas(16) static std::array<float, 3456> w77;
+  alignas(16) static std::array<float, 144> w78;
+  alignas(16) static std::array<float, 1296> w79;
+  alignas(16) static std::array<float, 144> w80;
+  alignas(16) static std::array<float, 3456> w81;
+  alignas(16) static std::array<float, 24> w82;
+  alignas(16) static std::array<float, 3456> w83;
+  alignas(16) static std::array<float, 144> w84;
+  alignas(16) static std::array<float, 1296> w85;
+  alignas(16) static std::array<float, 144> w86;
+  alignas(16) static std::array<float, 4608> w87;
+  alignas(16) static std::array<float, 32> w88;
+  alignas(16) static std::array<float, 6144> w89;
+  alignas(16) static std::array<float, 192> w90;
+  alignas(16) static std::array<float, 1728> w91;
+  alignas(16) static std::array<float, 192> w92;
+  alignas(16) static std::array<float, 6144> w93;
+  alignas(16) static std::array<float, 32> w94;
+  alignas(16) static std::array<float, 6144> w95;
+  alignas(16) static std::array<float, 192> w96;
+  alignas(16) static std::array<float, 1728> w97;
+  alignas(16) static std::array<float, 192> w98;
+  alignas(16) static std::array<float, 6144> w99;
+  alignas(16) static std::array<float, 32> w100;
+  alignas(16) static std::array<float, 6144> w101;
+  alignas(16) static std::array<float, 192> w102;
+  alignas(16) static std::array<float, 1728> w103;
+  alignas(16) static std::array<float, 192> w104;
+  alignas(16) static std::array<float, 12288> w105;
+  alignas(16) static std::array<float, 64> w106;
+  alignas(16) static std::array<float, 24576> w107;
+  alignas(16) static std::array<float, 384> w108;
+  alignas(16) static std::array<float, 3456> w109;
+  alignas(16) static std::array<float, 384> w110;
+  alignas(16) static std::array<float, 24576> w111;
+  alignas(16) static std::array<float, 64> w112;
+  alignas(16) static std::array<float, 24576> w113;
+  alignas(16) static std::array<float, 384> w114;
+  alignas(16) static std::array<float, 3456> w115;
+  alignas(16) static std::array<float, 384> w116;
+  alignas(16) static std::array<float, 24576> w117;
+  alignas(16) static std::array<float, 64> w118;
+  alignas(16) static std::array<float, 24576> w119;
+  alignas(16) static std::array<float, 384> w120;
+  alignas(16) static std::array<float, 3456> w121;
+  alignas(16) static std::array<float, 384> w122;
+  alignas(16) static std::array<float, 24576> w123;
+  alignas(16) static std::array<float, 64> w124;
+  alignas(16) static std::array<float, 24576> w125;
+  alignas(16) static std::array<float, 384> w126;
+  alignas(16) static std::array<float, 3456> w127;
+  alignas(16) static std::array<float, 384> w128;
+  alignas(16) static std::array<float, 36864> w129;
+  alignas(16) static std::array<float, 96> w130;
+  alignas(16) static std::array<float, 55296> w131;
+  alignas(16) static std::array<float, 576> w132;
+  alignas(16) static std::array<float, 5184> w133;
+  alignas(16) static std::array<float, 576> w134;
+  alignas(16) static std::array<float, 55296> w135;
+  alignas(16) static std::array<float, 96> w136;
+  alignas(16) static std::array<float, 55296> w137;
+  alignas(16) static std::array<float, 576> w138;
+  alignas(16) static std::array<float, 5184> w139;
+  alignas(16) static std::array<float, 576> w140;
+  alignas(16) static std::array<float, 55296> w141;
+  alignas(16) static std::array<float, 96> w142;
+  alignas(16) static std::array<float, 55296> w143;
+  alignas(16) static std::array<float, 576> w144;
+  alignas(16) static std::array<float, 5184> w145;
+  alignas(16) static std::array<float, 576> w146;
+  alignas(16) static std::array<float, 92160> w147;
+  alignas(16) static std::array<float, 160> w148;
+  alignas(16) static std::array<float, 153600> w149;
+  alignas(16) static std::array<float, 960> w150;
+  alignas(16) static std::array<float, 8640> w151;
+  alignas(16) static std::array<float, 960> w152;
+  alignas(16) static std::array<float, 153600> w153;
+  alignas(16) static std::array<float, 160> w154;
+  alignas(16) static std::array<float, 153600> w155;
+  alignas(16) static std::array<float, 960> w156;
+  alignas(16) static std::array<float, 8640> w157;
+  alignas(16) static std::array<float, 960> w158;
+  alignas(16) static std::array<float, 153600> w159;
+  alignas(16) static std::array<float, 160> w160;
+  alignas(16) static std::array<float, 153600> w161;
+  alignas(16) static std::array<float, 960> w162;
+  alignas(16) static std::array<float, 8640> w163;
+  alignas(16) static std::array<float, 960> w164;
+  alignas(16) static std::array<float, 307200> w165;
+  alignas(16) static std::array<float, 320> w166;
+  alignas(16) static std::array<float, 409600> w167;
+  alignas(16) static std::array<float, 1280> w168;
+  alignas(16) static std::array<float, 1281280> w169;
+  alignas(16) static std::array<float, 1001> w170;
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.0f, +1.0f), std::ref(rng));
-  std::generate(v0, v0 + 150528, std::ref(f32rng));
-  std::generate(v1, v1 + 401408, std::ref(f32rng));
-  std::generate(v2, v2 + 401408, std::ref(f32rng));
-  std::generate(v3, v3 + 200704, std::ref(f32rng));
-  std::generate(v4, v4 + 1204224, std::ref(f32rng));
-  std::generate(v5, v5 + 301056, std::ref(f32rng));
-  std::generate(v6, v6 + 75264, std::ref(f32rng));
-  std::generate(v7, v7 + 451584, std::ref(f32rng));
-  std::generate(v8, v8 + 451584, std::ref(f32rng));
-  std::generate(v9, v9 + 75264, std::ref(f32rng));
-  std::generate(v10, v10 + 75264, std::ref(f32rng));
-  std::generate(v11, v11 + 451584, std::ref(f32rng));
-  std::generate(v12, v12 + 112896, std::ref(f32rng));
-  std::generate(v13, v13 + 25088, std::ref(f32rng));
-  std::generate(v14, v14 + 150528, std::ref(f32rng));
-  std::generate(v15, v15 + 150528, std::ref(f32rng));
-  std::generate(v16, v16 + 25088, std::ref(f32rng));
-  std::generate(v17, v17 + 25088, std::ref(f32rng));
-  std::generate(v18, v18 + 150528, std::ref(f32rng));
-  std::generate(v19, v19 + 150528, std::ref(f32rng));
-  std::generate(v20, v20 + 25088, std::ref(f32rng));
-  std::generate(v21, v21 + 25088, std::ref(f32rng));
-  std::generate(v22, v22 + 150528, std::ref(f32rng));
-  std::generate(v23, v23 + 37632, std::ref(f32rng));
-  std::generate(v24, v24 + 12544, std::ref(f32rng));
-  std::generate(v25, v25 + 75264, std::ref(f32rng));
-  std::generate(v26, v26 + 75264, std::ref(f32rng));
-  std::generate(v27, v27 + 12544, std::ref(f32rng));
-  std::generate(v28, v28 + 12544, std::ref(f32rng));
-  std::generate(v29, v29 + 75264, std::ref(f32rng));
-  std::generate(v30, v30 + 75264, std::ref(f32rng));
-  std::generate(v31, v31 + 12544, std::ref(f32rng));
-  std::generate(v32, v32 + 12544, std::ref(f32rng));
-  std::generate(v33, v33 + 75264, std::ref(f32rng));
-  std::generate(v34, v34 + 75264, std::ref(f32rng));
-  std::generate(v35, v35 + 12544, std::ref(f32rng));
-  std::generate(v36, v36 + 12544, std::ref(f32rng));
-  std::generate(v37, v37 + 75264, std::ref(f32rng));
-  std::generate(v38, v38 + 75264, std::ref(f32rng));
-  std::generate(v39, v39 + 18816, std::ref(f32rng));
-  std::generate(v40, v40 + 112896, std::ref(f32rng));
-  std::generate(v41, v41 + 112896, std::ref(f32rng));
-  std::generate(v42, v42 + 18816, std::ref(f32rng));
-  std::generate(v43, v43 + 18816, std::ref(f32rng));
-  std::generate(v44, v44 + 112896, std::ref(f32rng));
-  std::generate(v45, v45 + 112896, std::ref(f32rng));
-  std::generate(v46, v46 + 18816, std::ref(f32rng));
-  std::generate(v47, v47 + 18816, std::ref(f32rng));
-  std::generate(v48, v48 + 112896, std::ref(f32rng));
-  std::generate(v49, v49 + 28224, std::ref(f32rng));
-  std::generate(v50, v50 + 7840, std::ref(f32rng));
-  std::generate(v51, v51 + 47040, std::ref(f32rng));
-  std::generate(v52, v52 + 47040, std::ref(f32rng));
-  std::generate(v53, v53 + 7840, std::ref(f32rng));
-  std::generate(v54, v54 + 7840, std::ref(f32rng));
-  std::generate(v55, v55 + 47040, std::ref(f32rng));
-  std::generate(v56, v56 + 47040, std::ref(f32rng));
-  std::generate(v57, v57 + 7840, std::ref(f32rng));
-  std::generate(v58, v58 + 7840, std::ref(f32rng));
-  std::generate(v59, v59 + 47040, std::ref(f32rng));
-  std::generate(v60, v60 + 47040, std::ref(f32rng));
-  std::generate(v61, v61 + 15680, std::ref(f32rng));
-  std::generate(v62, v62 + 62720, std::ref(f32rng));
-  std::generate(v63, v63 + 1280, std::ref(f32rng));
-  std::generate(v64, v64 + 1001, std::ref(f32rng));
-  std::generate(w65, w65 + 864, std::ref(f32rng));
-  std::generate(w66, w66 + 32, std::ref(f32rng));
-  std::generate(w67, w67 + 288, std::ref(f32rng));
-  std::generate(w68, w68 + 32, std::ref(f32rng));
-  std::generate(w69, w69 + 512, std::ref(f32rng));
-  std::generate(w70, w70 + 16, std::ref(f32rng));
-  std::generate(w71, w71 + 1536, std::ref(f32rng));
-  std::generate(w72, w72 + 96, std::ref(f32rng));
-  std::generate(w73, w73 + 864, std::ref(f32rng));
-  std::generate(w74, w74 + 96, std::ref(f32rng));
-  std::generate(w75, w75 + 2304, std::ref(f32rng));
-  std::generate(w76, w76 + 24, std::ref(f32rng));
-  std::generate(w77, w77 + 3456, std::ref(f32rng));
-  std::generate(w78, w78 + 144, std::ref(f32rng));
-  std::generate(w79, w79 + 1296, std::ref(f32rng));
-  std::generate(w80, w80 + 144, std::ref(f32rng));
-  std::generate(w81, w81 + 3456, std::ref(f32rng));
-  std::generate(w82, w82 + 24, std::ref(f32rng));
-  std::generate(w83, w83 + 3456, std::ref(f32rng));
-  std::generate(w84, w84 + 144, std::ref(f32rng));
-  std::generate(w85, w85 + 1296, std::ref(f32rng));
-  std::generate(w86, w86 + 144, std::ref(f32rng));
-  std::generate(w87, w87 + 4608, std::ref(f32rng));
-  std::generate(w88, w88 + 32, std::ref(f32rng));
-  std::generate(w89, w89 + 6144, std::ref(f32rng));
-  std::generate(w90, w90 + 192, std::ref(f32rng));
-  std::generate(w91, w91 + 1728, std::ref(f32rng));
-  std::generate(w92, w92 + 192, std::ref(f32rng));
-  std::generate(w93, w93 + 6144, std::ref(f32rng));
-  std::generate(w94, w94 + 32, std::ref(f32rng));
-  std::generate(w95, w95 + 6144, std::ref(f32rng));
-  std::generate(w96, w96 + 192, std::ref(f32rng));
-  std::generate(w97, w97 + 1728, std::ref(f32rng));
-  std::generate(w98, w98 + 192, std::ref(f32rng));
-  std::generate(w99, w99 + 6144, std::ref(f32rng));
-  std::generate(w100, w100 + 32, std::ref(f32rng));
-  std::generate(w101, w101 + 6144, std::ref(f32rng));
-  std::generate(w102, w102 + 192, std::ref(f32rng));
-  std::generate(w103, w103 + 1728, std::ref(f32rng));
-  std::generate(w104, w104 + 192, std::ref(f32rng));
-  std::generate(w105, w105 + 12288, std::ref(f32rng));
-  std::generate(w106, w106 + 64, std::ref(f32rng));
-  std::generate(w107, w107 + 24576, std::ref(f32rng));
-  std::generate(w108, w108 + 384, std::ref(f32rng));
-  std::generate(w109, w109 + 3456, std::ref(f32rng));
-  std::generate(w110, w110 + 384, std::ref(f32rng));
-  std::generate(w111, w111 + 24576, std::ref(f32rng));
-  std::generate(w112, w112 + 64, std::ref(f32rng));
-  std::generate(w113, w113 + 24576, std::ref(f32rng));
-  std::generate(w114, w114 + 384, std::ref(f32rng));
-  std::generate(w115, w115 + 3456, std::ref(f32rng));
-  std::generate(w116, w116 + 384, std::ref(f32rng));
-  std::generate(w117, w117 + 24576, std::ref(f32rng));
-  std::generate(w118, w118 + 64, std::ref(f32rng));
-  std::generate(w119, w119 + 24576, std::ref(f32rng));
-  std::generate(w120, w120 + 384, std::ref(f32rng));
-  std::generate(w121, w121 + 3456, std::ref(f32rng));
-  std::generate(w122, w122 + 384, std::ref(f32rng));
-  std::generate(w123, w123 + 24576, std::ref(f32rng));
-  std::generate(w124, w124 + 64, std::ref(f32rng));
-  std::generate(w125, w125 + 24576, std::ref(f32rng));
-  std::generate(w126, w126 + 384, std::ref(f32rng));
-  std::generate(w127, w127 + 3456, std::ref(f32rng));
-  std::generate(w128, w128 + 384, std::ref(f32rng));
-  std::generate(w129, w129 + 36864, std::ref(f32rng));
-  std::generate(w130, w130 + 96, std::ref(f32rng));
-  std::generate(w131, w131 + 55296, std::ref(f32rng));
-  std::generate(w132, w132 + 576, std::ref(f32rng));
-  std::generate(w133, w133 + 5184, std::ref(f32rng));
-  std::generate(w134, w134 + 576, std::ref(f32rng));
-  std::generate(w135, w135 + 55296, std::ref(f32rng));
-  std::generate(w136, w136 + 96, std::ref(f32rng));
-  std::generate(w137, w137 + 55296, std::ref(f32rng));
-  std::generate(w138, w138 + 576, std::ref(f32rng));
-  std::generate(w139, w139 + 5184, std::ref(f32rng));
-  std::generate(w140, w140 + 576, std::ref(f32rng));
-  std::generate(w141, w141 + 55296, std::ref(f32rng));
-  std::generate(w142, w142 + 96, std::ref(f32rng));
-  std::generate(w143, w143 + 55296, std::ref(f32rng));
-  std::generate(w144, w144 + 576, std::ref(f32rng));
-  std::generate(w145, w145 + 5184, std::ref(f32rng));
-  std::generate(w146, w146 + 576, std::ref(f32rng));
-  std::generate(w147, w147 + 92160, std::ref(f32rng));
-  std::generate(w148, w148 + 160, std::ref(f32rng));
-  std::generate(w149, w149 + 153600, std::ref(f32rng));
-  std::generate(w150, w150 + 960, std::ref(f32rng));
-  std::generate(w151, w151 + 8640, std::ref(f32rng));
-  std::generate(w152, w152 + 960, std::ref(f32rng));
-  std::generate(w153, w153 + 153600, std::ref(f32rng));
-  std::generate(w154, w154 + 160, std::ref(f32rng));
-  std::generate(w155, w155 + 153600, std::ref(f32rng));
-  std::generate(w156, w156 + 960, std::ref(f32rng));
-  std::generate(w157, w157 + 8640, std::ref(f32rng));
-  std::generate(w158, w158 + 960, std::ref(f32rng));
-  std::generate(w159, w159 + 153600, std::ref(f32rng));
-  std::generate(w160, w160 + 160, std::ref(f32rng));
-  std::generate(w161, w161 + 153600, std::ref(f32rng));
-  std::generate(w162, w162 + 960, std::ref(f32rng));
-  std::generate(w163, w163 + 8640, std::ref(f32rng));
-  std::generate(w164, w164 + 960, std::ref(f32rng));
-  std::generate(w165, w165 + 307200, std::ref(f32rng));
-  std::generate(w166, w166 + 320, std::ref(f32rng));
-  std::generate(w167, w167 + 409600, std::ref(f32rng));
-  std::generate(w168, w168 + 1280, std::ref(f32rng));
-  std::generate(w169, w169 + 1281280, std::ref(f32rng));
-  std::generate(w170, w170 + 1001, std::ref(f32rng));
+  std::generate(v0.begin(), v0.end(), std::ref(f32rng));
+  std::generate(v1.begin(), v1.end(), std::ref(f32rng));
+  std::generate(v2.begin(), v2.end(), std::ref(f32rng));
+  std::generate(v3.begin(), v3.end(), std::ref(f32rng));
+  std::generate(v4.begin(), v4.end(), std::ref(f32rng));
+  std::generate(v5.begin(), v5.end(), std::ref(f32rng));
+  std::generate(v6.begin(), v6.end(), std::ref(f32rng));
+  std::generate(v7.begin(), v7.end(), std::ref(f32rng));
+  std::generate(v8.begin(), v8.end(), std::ref(f32rng));
+  std::generate(v9.begin(), v9.end(), std::ref(f32rng));
+  std::generate(v10.begin(), v10.end(), std::ref(f32rng));
+  std::generate(v11.begin(), v11.end(), std::ref(f32rng));
+  std::generate(v12.begin(), v12.end(), std::ref(f32rng));
+  std::generate(v13.begin(), v13.end(), std::ref(f32rng));
+  std::generate(v14.begin(), v14.end(), std::ref(f32rng));
+  std::generate(v15.begin(), v15.end(), std::ref(f32rng));
+  std::generate(v16.begin(), v16.end(), std::ref(f32rng));
+  std::generate(v17.begin(), v17.end(), std::ref(f32rng));
+  std::generate(v18.begin(), v18.end(), std::ref(f32rng));
+  std::generate(v19.begin(), v19.end(), std::ref(f32rng));
+  std::generate(v20.begin(), v20.end(), std::ref(f32rng));
+  std::generate(v21.begin(), v21.end(), std::ref(f32rng));
+  std::generate(v22.begin(), v22.end(), std::ref(f32rng));
+  std::generate(v23.begin(), v23.end(), std::ref(f32rng));
+  std::generate(v24.begin(), v24.end(), std::ref(f32rng));
+  std::generate(v25.begin(), v25.end(), std::ref(f32rng));
+  std::generate(v26.begin(), v26.end(), std::ref(f32rng));
+  std::generate(v27.begin(), v27.end(), std::ref(f32rng));
+  std::generate(v28.begin(), v28.end(), std::ref(f32rng));
+  std::generate(v29.begin(), v29.end(), std::ref(f32rng));
+  std::generate(v30.begin(), v30.end(), std::ref(f32rng));
+  std::generate(v31.begin(), v31.end(), std::ref(f32rng));
+  std::generate(v32.begin(), v32.end(), std::ref(f32rng));
+  std::generate(v33.begin(), v33.end(), std::ref(f32rng));
+  std::generate(v34.begin(), v34.end(), std::ref(f32rng));
+  std::generate(v35.begin(), v35.end(), std::ref(f32rng));
+  std::generate(v36.begin(), v36.end(), std::ref(f32rng));
+  std::generate(v37.begin(), v37.end(), std::ref(f32rng));
+  std::generate(v38.begin(), v38.end(), std::ref(f32rng));
+  std::generate(v39.begin(), v39.end(), std::ref(f32rng));
+  std::generate(v40.begin(), v40.end(), std::ref(f32rng));
+  std::generate(v41.begin(), v41.end(), std::ref(f32rng));
+  std::generate(v42.begin(), v42.end(), std::ref(f32rng));
+  std::generate(v43.begin(), v43.end(), std::ref(f32rng));
+  std::generate(v44.begin(), v44.end(), std::ref(f32rng));
+  std::generate(v45.begin(), v45.end(), std::ref(f32rng));
+  std::generate(v46.begin(), v46.end(), std::ref(f32rng));
+  std::generate(v47.begin(), v47.end(), std::ref(f32rng));
+  std::generate(v48.begin(), v48.end(), std::ref(f32rng));
+  std::generate(v49.begin(), v49.end(), std::ref(f32rng));
+  std::generate(v50.begin(), v50.end(), std::ref(f32rng));
+  std::generate(v51.begin(), v51.end(), std::ref(f32rng));
+  std::generate(v52.begin(), v52.end(), std::ref(f32rng));
+  std::generate(v53.begin(), v53.end(), std::ref(f32rng));
+  std::generate(v54.begin(), v54.end(), std::ref(f32rng));
+  std::generate(v55.begin(), v55.end(), std::ref(f32rng));
+  std::generate(v56.begin(), v56.end(), std::ref(f32rng));
+  std::generate(v57.begin(), v57.end(), std::ref(f32rng));
+  std::generate(v58.begin(), v58.end(), std::ref(f32rng));
+  std::generate(v59.begin(), v59.end(), std::ref(f32rng));
+  std::generate(v60.begin(), v60.end(), std::ref(f32rng));
+  std::generate(v61.begin(), v61.end(), std::ref(f32rng));
+  std::generate(v62.begin(), v62.end(), std::ref(f32rng));
+  std::generate(v63.begin(), v63.end(), std::ref(f32rng));
+  std::generate(v64.begin(), v64.end(), std::ref(f32rng));
+  std::generate(w65.begin(), w65.end(), std::ref(f32rng));
+  std::generate(w66.begin(), w66.end(), std::ref(f32rng));
+  std::generate(w67.begin(), w67.end(), std::ref(f32rng));
+  std::generate(w68.begin(), w68.end(), std::ref(f32rng));
+  std::generate(w69.begin(), w69.end(), std::ref(f32rng));
+  std::generate(w70.begin(), w70.end(), std::ref(f32rng));
+  std::generate(w71.begin(), w71.end(), std::ref(f32rng));
+  std::generate(w72.begin(), w72.end(), std::ref(f32rng));
+  std::generate(w73.begin(), w73.end(), std::ref(f32rng));
+  std::generate(w74.begin(), w74.end(), std::ref(f32rng));
+  std::generate(w75.begin(), w75.end(), std::ref(f32rng));
+  std::generate(w76.begin(), w76.end(), std::ref(f32rng));
+  std::generate(w77.begin(), w77.end(), std::ref(f32rng));
+  std::generate(w78.begin(), w78.end(), std::ref(f32rng));
+  std::generate(w79.begin(), w79.end(), std::ref(f32rng));
+  std::generate(w80.begin(), w80.end(), std::ref(f32rng));
+  std::generate(w81.begin(), w81.end(), std::ref(f32rng));
+  std::generate(w82.begin(), w82.end(), std::ref(f32rng));
+  std::generate(w83.begin(), w83.end(), std::ref(f32rng));
+  std::generate(w84.begin(), w84.end(), std::ref(f32rng));
+  std::generate(w85.begin(), w85.end(), std::ref(f32rng));
+  std::generate(w86.begin(), w86.end(), std::ref(f32rng));
+  std::generate(w87.begin(), w87.end(), std::ref(f32rng));
+  std::generate(w88.begin(), w88.end(), std::ref(f32rng));
+  std::generate(w89.begin(), w89.end(), std::ref(f32rng));
+  std::generate(w90.begin(), w90.end(), std::ref(f32rng));
+  std::generate(w91.begin(), w91.end(), std::ref(f32rng));
+  std::generate(w92.begin(), w92.end(), std::ref(f32rng));
+  std::generate(w93.begin(), w93.end(), std::ref(f32rng));
+  std::generate(w94.begin(), w94.end(), std::ref(f32rng));
+  std::generate(w95.begin(), w95.end(), std::ref(f32rng));
+  std::generate(w96.begin(), w96.end(), std::ref(f32rng));
+  std::generate(w97.begin(), w97.end(), std::ref(f32rng));
+  std::generate(w98.begin(), w98.end(), std::ref(f32rng));
+  std::generate(w99.begin(), w99.end(), std::ref(f32rng));
+  std::generate(w100.begin(), w100.end(), std::ref(f32rng));
+  std::generate(w101.begin(), w101.end(), std::ref(f32rng));
+  std::generate(w102.begin(), w102.end(), std::ref(f32rng));
+  std::generate(w103.begin(), w103.end(), std::ref(f32rng));
+  std::generate(w104.begin(), w104.end(), std::ref(f32rng));
+  std::generate(w105.begin(), w105.end(), std::ref(f32rng));
+  std::generate(w106.begin(), w106.end(), std::ref(f32rng));
+  std::generate(w107.begin(), w107.end(), std::ref(f32rng));
+  std::generate(w108.begin(), w108.end(), std::ref(f32rng));
+  std::generate(w109.begin(), w109.end(), std::ref(f32rng));
+  std::generate(w110.begin(), w110.end(), std::ref(f32rng));
+  std::generate(w111.begin(), w111.end(), std::ref(f32rng));
+  std::generate(w112.begin(), w112.end(), std::ref(f32rng));
+  std::generate(w113.begin(), w113.end(), std::ref(f32rng));
+  std::generate(w114.begin(), w114.end(), std::ref(f32rng));
+  std::generate(w115.begin(), w115.end(), std::ref(f32rng));
+  std::generate(w116.begin(), w116.end(), std::ref(f32rng));
+  std::generate(w117.begin(), w117.end(), std::ref(f32rng));
+  std::generate(w118.begin(), w118.end(), std::ref(f32rng));
+  std::generate(w119.begin(), w119.end(), std::ref(f32rng));
+  std::generate(w120.begin(), w120.end(), std::ref(f32rng));
+  std::generate(w121.begin(), w121.end(), std::ref(f32rng));
+  std::generate(w122.begin(), w122.end(), std::ref(f32rng));
+  std::generate(w123.begin(), w123.end(), std::ref(f32rng));
+  std::generate(w124.begin(), w124.end(), std::ref(f32rng));
+  std::generate(w125.begin(), w125.end(), std::ref(f32rng));
+  std::generate(w126.begin(), w126.end(), std::ref(f32rng));
+  std::generate(w127.begin(), w127.end(), std::ref(f32rng));
+  std::generate(w128.begin(), w128.end(), std::ref(f32rng));
+  std::generate(w129.begin(), w129.end(), std::ref(f32rng));
+  std::generate(w130.begin(), w130.end(), std::ref(f32rng));
+  std::generate(w131.begin(), w131.end(), std::ref(f32rng));
+  std::generate(w132.begin(), w132.end(), std::ref(f32rng));
+  std::generate(w133.begin(), w133.end(), std::ref(f32rng));
+  std::generate(w134.begin(), w134.end(), std::ref(f32rng));
+  std::generate(w135.begin(), w135.end(), std::ref(f32rng));
+  std::generate(w136.begin(), w136.end(), std::ref(f32rng));
+  std::generate(w137.begin(), w137.end(), std::ref(f32rng));
+  std::generate(w138.begin(), w138.end(), std::ref(f32rng));
+  std::generate(w139.begin(), w139.end(), std::ref(f32rng));
+  std::generate(w140.begin(), w140.end(), std::ref(f32rng));
+  std::generate(w141.begin(), w141.end(), std::ref(f32rng));
+  std::generate(w142.begin(), w142.end(), std::ref(f32rng));
+  std::generate(w143.begin(), w143.end(), std::ref(f32rng));
+  std::generate(w144.begin(), w144.end(), std::ref(f32rng));
+  std::generate(w145.begin(), w145.end(), std::ref(f32rng));
+  std::generate(w146.begin(), w146.end(), std::ref(f32rng));
+  std::generate(w147.begin(), w147.end(), std::ref(f32rng));
+  std::generate(w148.begin(), w148.end(), std::ref(f32rng));
+  std::generate(w149.begin(), w149.end(), std::ref(f32rng));
+  std::generate(w150.begin(), w150.end(), std::ref(f32rng));
+  std::generate(w151.begin(), w151.end(), std::ref(f32rng));
+  std::generate(w152.begin(), w152.end(), std::ref(f32rng));
+  std::generate(w153.begin(), w153.end(), std::ref(f32rng));
+  std::generate(w154.begin(), w154.end(), std::ref(f32rng));
+  std::generate(w155.begin(), w155.end(), std::ref(f32rng));
+  std::generate(w156.begin(), w156.end(), std::ref(f32rng));
+  std::generate(w157.begin(), w157.end(), std::ref(f32rng));
+  std::generate(w158.begin(), w158.end(), std::ref(f32rng));
+  std::generate(w159.begin(), w159.end(), std::ref(f32rng));
+  std::generate(w160.begin(), w160.end(), std::ref(f32rng));
+  std::generate(w161.begin(), w161.end(), std::ref(f32rng));
+  std::generate(w162.begin(), w162.end(), std::ref(f32rng));
+  std::generate(w163.begin(), w163.end(), std::ref(f32rng));
+  std::generate(w164.begin(), w164.end(), std::ref(f32rng));
+  std::generate(w165.begin(), w165.end(), std::ref(f32rng));
+  std::generate(w166.begin(), w166.end(), std::ref(f32rng));
+  std::generate(w167.begin(), w167.end(), std::ref(f32rng));
+  std::generate(w168.begin(), w168.end(), std::ref(f32rng));
+  std::generate(w169.begin(), w169.end(), std::ref(f32rng));
+  std::generate(w170.begin(), w170.end(), std::ref(f32rng));
 
   ExecutionPlan operators;
   xnn_status status;
@@ -378,7 +379,7 @@
     32 /* output_channels_per_group */,
     3 /* input pixel stride */,
     32 /* output pixel stride */,
-    w65, w66,
+    w65.data(), w66.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op0);
@@ -400,7 +401,7 @@
     1 /* output_channels_per_group */,
     32 /* input pixel stride */,
     32 /* output pixel stride */,
-    w67, w68,
+    w67.data(), w68.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op1);
@@ -422,7 +423,7 @@
     16 /* output_channels_per_group */,
     32 /* input pixel stride */,
     16 /* output pixel stride */,
-    w69, w70,
+    w69.data(), w70.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op2);
@@ -444,7 +445,7 @@
     96 /* output_channels_per_group */,
     16 /* input pixel stride */,
     96 /* output pixel stride */,
-    w71, w72,
+    w71.data(), w72.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op3);
@@ -466,7 +467,7 @@
     1 /* output_channels_per_group */,
     96 /* input pixel stride */,
     96 /* output pixel stride */,
-    w73, w74,
+    w73.data(), w74.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op4);
@@ -488,7 +489,7 @@
     24 /* output_channels_per_group */,
     96 /* input pixel stride */,
     24 /* output pixel stride */,
-    w75, w76,
+    w75.data(), w76.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op5);
@@ -510,7 +511,7 @@
     144 /* output_channels_per_group */,
     24 /* input pixel stride */,
     144 /* output pixel stride */,
-    w77, w78,
+    w77.data(), w78.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op6);
@@ -532,7 +533,7 @@
     1 /* output_channels_per_group */,
     144 /* input pixel stride */,
     144 /* output pixel stride */,
-    w79, w80,
+    w79.data(), w80.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op7);
@@ -554,7 +555,7 @@
     24 /* output_channels_per_group */,
     144 /* input pixel stride */,
     24 /* output pixel stride */,
-    w81, w82,
+    w81.data(), w82.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op8);
@@ -587,7 +588,7 @@
     144 /* output_channels_per_group */,
     24 /* input pixel stride */,
     144 /* output pixel stride */,
-    w83, w84,
+    w83.data(), w84.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op10);
@@ -609,7 +610,7 @@
     1 /* output_channels_per_group */,
     144 /* input pixel stride */,
     144 /* output pixel stride */,
-    w85, w86,
+    w85.data(), w86.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op11);
@@ -631,7 +632,7 @@
     32 /* output_channels_per_group */,
     144 /* input pixel stride */,
     32 /* output pixel stride */,
-    w87, w88,
+    w87.data(), w88.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op12);
@@ -653,7 +654,7 @@
     192 /* output_channels_per_group */,
     32 /* input pixel stride */,
     192 /* output pixel stride */,
-    w89, w90,
+    w89.data(), w90.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op13);
@@ -675,7 +676,7 @@
     1 /* output_channels_per_group */,
     192 /* input pixel stride */,
     192 /* output pixel stride */,
-    w91, w92,
+    w91.data(), w92.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op14);
@@ -697,7 +698,7 @@
     32 /* output_channels_per_group */,
     192 /* input pixel stride */,
     32 /* output pixel stride */,
-    w93, w94,
+    w93.data(), w94.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op15);
@@ -730,7 +731,7 @@
     192 /* output_channels_per_group */,
     32 /* input pixel stride */,
     192 /* output pixel stride */,
-    w95, w96,
+    w95.data(), w96.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op17);
@@ -752,7 +753,7 @@
     1 /* output_channels_per_group */,
     192 /* input pixel stride */,
     192 /* output pixel stride */,
-    w97, w98,
+    w97.data(), w98.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op18);
@@ -774,7 +775,7 @@
     32 /* output_channels_per_group */,
     192 /* input pixel stride */,
     32 /* output pixel stride */,
-    w99, w100,
+    w99.data(), w100.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op19);
@@ -807,7 +808,7 @@
     192 /* output_channels_per_group */,
     32 /* input pixel stride */,
     192 /* output pixel stride */,
-    w101, w102,
+    w101.data(), w102.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op21);
@@ -829,7 +830,7 @@
     1 /* output_channels_per_group */,
     192 /* input pixel stride */,
     192 /* output pixel stride */,
-    w103, w104,
+    w103.data(), w104.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op22);
@@ -851,7 +852,7 @@
     64 /* output_channels_per_group */,
     192 /* input pixel stride */,
     64 /* output pixel stride */,
-    w105, w106,
+    w105.data(), w106.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op23);
@@ -873,7 +874,7 @@
     384 /* output_channels_per_group */,
     64 /* input pixel stride */,
     384 /* output pixel stride */,
-    w107, w108,
+    w107.data(), w108.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op24);
@@ -895,7 +896,7 @@
     1 /* output_channels_per_group */,
     384 /* input pixel stride */,
     384 /* output pixel stride */,
-    w109, w110,
+    w109.data(), w110.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op25);
@@ -917,7 +918,7 @@
     64 /* output_channels_per_group */,
     384 /* input pixel stride */,
     64 /* output pixel stride */,
-    w111, w112,
+    w111.data(), w112.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op26);
@@ -950,7 +951,7 @@
     384 /* output_channels_per_group */,
     64 /* input pixel stride */,
     384 /* output pixel stride */,
-    w113, w114,
+    w113.data(), w114.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op28);
@@ -972,7 +973,7 @@
     1 /* output_channels_per_group */,
     384 /* input pixel stride */,
     384 /* output pixel stride */,
-    w115, w116,
+    w115.data(), w116.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op29);
@@ -994,7 +995,7 @@
     64 /* output_channels_per_group */,
     384 /* input pixel stride */,
     64 /* output pixel stride */,
-    w117, w118,
+    w117.data(), w118.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op30);
@@ -1027,7 +1028,7 @@
     384 /* output_channels_per_group */,
     64 /* input pixel stride */,
     384 /* output pixel stride */,
-    w119, w120,
+    w119.data(), w120.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op32);
@@ -1049,7 +1050,7 @@
     1 /* output_channels_per_group */,
     384 /* input pixel stride */,
     384 /* output pixel stride */,
-    w121, w122,
+    w121.data(), w122.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op33);
@@ -1071,7 +1072,7 @@
     64 /* output_channels_per_group */,
     384 /* input pixel stride */,
     64 /* output pixel stride */,
-    w123, w124,
+    w123.data(), w124.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op34);
@@ -1104,7 +1105,7 @@
     384 /* output_channels_per_group */,
     64 /* input pixel stride */,
     384 /* output pixel stride */,
-    w125, w126,
+    w125.data(), w126.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op36);
@@ -1126,7 +1127,7 @@
     1 /* output_channels_per_group */,
     384 /* input pixel stride */,
     384 /* output pixel stride */,
-    w127, w128,
+    w127.data(), w128.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op37);
@@ -1148,7 +1149,7 @@
     96 /* output_channels_per_group */,
     384 /* input pixel stride */,
     96 /* output pixel stride */,
-    w129, w130,
+    w129.data(), w130.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op38);
@@ -1170,7 +1171,7 @@
     576 /* output_channels_per_group */,
     96 /* input pixel stride */,
     576 /* output pixel stride */,
-    w131, w132,
+    w131.data(), w132.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op39);
@@ -1192,7 +1193,7 @@
     1 /* output_channels_per_group */,
     576 /* input pixel stride */,
     576 /* output pixel stride */,
-    w133, w134,
+    w133.data(), w134.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op40);
@@ -1214,7 +1215,7 @@
     96 /* output_channels_per_group */,
     576 /* input pixel stride */,
     96 /* output pixel stride */,
-    w135, w136,
+    w135.data(), w136.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op41);
@@ -1247,7 +1248,7 @@
     576 /* output_channels_per_group */,
     96 /* input pixel stride */,
     576 /* output pixel stride */,
-    w137, w138,
+    w137.data(), w138.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op43);
@@ -1269,7 +1270,7 @@
     1 /* output_channels_per_group */,
     576 /* input pixel stride */,
     576 /* output pixel stride */,
-    w139, w140,
+    w139.data(), w140.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op44);
@@ -1291,7 +1292,7 @@
     96 /* output_channels_per_group */,
     576 /* input pixel stride */,
     96 /* output pixel stride */,
-    w141, w142,
+    w141.data(), w142.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op45);
@@ -1324,7 +1325,7 @@
     576 /* output_channels_per_group */,
     96 /* input pixel stride */,
     576 /* output pixel stride */,
-    w143, w144,
+    w143.data(), w144.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op47);
@@ -1346,7 +1347,7 @@
     1 /* output_channels_per_group */,
     576 /* input pixel stride */,
     576 /* output pixel stride */,
-    w145, w146,
+    w145.data(), w146.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op48);
@@ -1368,7 +1369,7 @@
     160 /* output_channels_per_group */,
     576 /* input pixel stride */,
     160 /* output pixel stride */,
-    w147, w148,
+    w147.data(), w148.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op49);
@@ -1390,7 +1391,7 @@
     960 /* output_channels_per_group */,
     160 /* input pixel stride */,
     960 /* output pixel stride */,
-    w149, w150,
+    w149.data(), w150.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op50);
@@ -1412,7 +1413,7 @@
     1 /* output_channels_per_group */,
     960 /* input pixel stride */,
     960 /* output pixel stride */,
-    w151, w152,
+    w151.data(), w152.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op51);
@@ -1434,7 +1435,7 @@
     160 /* output_channels_per_group */,
     960 /* input pixel stride */,
     160 /* output pixel stride */,
-    w153, w154,
+    w153.data(), w154.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op52);
@@ -1467,7 +1468,7 @@
     960 /* output_channels_per_group */,
     160 /* input pixel stride */,
     960 /* output pixel stride */,
-    w155, w156,
+    w155.data(), w156.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op54);
@@ -1489,7 +1490,7 @@
     1 /* output_channels_per_group */,
     960 /* input pixel stride */,
     960 /* output pixel stride */,
-    w157, w158,
+    w157.data(), w158.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op55);
@@ -1511,7 +1512,7 @@
     160 /* output_channels_per_group */,
     960 /* input pixel stride */,
     160 /* output pixel stride */,
-    w159, w160,
+    w159.data(), w160.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op56);
@@ -1544,7 +1545,7 @@
     960 /* output_channels_per_group */,
     160 /* input pixel stride */,
     960 /* output pixel stride */,
-    w161, w162,
+    w161.data(), w162.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op58);
@@ -1566,7 +1567,7 @@
     1 /* output_channels_per_group */,
     960 /* input pixel stride */,
     960 /* output pixel stride */,
-    w163, w164,
+    w163.data(), w164.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op59);
@@ -1588,7 +1589,7 @@
     320 /* output_channels_per_group */,
     960 /* input pixel stride */,
     320 /* output pixel stride */,
-    w165, w166,
+    w165.data(), w166.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op60);
@@ -1610,7 +1611,7 @@
     1280 /* output_channels_per_group */,
     320 /* input pixel stride */,
     1280 /* output pixel stride */,
-    w167, w168,
+    w167.data(), w168.data(),
     0.0f /* output min */, 6.0f /* output max */,
     0 /* flags */,
     &op61);
@@ -1644,7 +1645,7 @@
     1001 /* output_channels_per_group */,
     1280 /* input pixel stride */,
     1001 /* output pixel stride */,
-    w169, w170,
+    w169.data(), w170.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op63);
@@ -1659,7 +1660,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op0,
     1 /* batch size */, 224 /* input height */, 224 /* input width */,
-    v0 /* input */, v1 /* output */,
+    v0.data() /* input */, v1.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #0" << std::endl;
@@ -1669,7 +1670,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op1,
     1 /* batch size */, 112 /* input height */, 112 /* input width */,
-    v1 /* input */, v2 /* output */,
+    v1.data() /* input */, v2.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #1" << std::endl;
@@ -1679,7 +1680,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op2,
     1 /* batch size */, 112 /* input height */, 112 /* input width */,
-    v2 /* input */, v3 /* output */,
+    v2.data() /* input */, v3.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #2" << std::endl;
@@ -1689,7 +1690,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op3,
     1 /* batch size */, 112 /* input height */, 112 /* input width */,
-    v3 /* input */, v4 /* output */,
+    v3.data() /* input */, v4.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #3" << std::endl;
@@ -1699,7 +1700,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op4,
     1 /* batch size */, 112 /* input height */, 112 /* input width */,
-    v4 /* input */, v5 /* output */,
+    v4.data() /* input */, v5.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #4" << std::endl;
@@ -1709,7 +1710,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op5,
     1 /* batch size */, 56 /* input height */, 56 /* input width */,
-    v5 /* input */, v6 /* output */,
+    v5.data() /* input */, v6.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #5" << std::endl;
@@ -1719,7 +1720,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op6,
     1 /* batch size */, 56 /* input height */, 56 /* input width */,
-    v6 /* input */, v7 /* output */,
+    v6.data() /* input */, v7.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #6" << std::endl;
@@ -1729,7 +1730,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op7,
     1 /* batch size */, 56 /* input height */, 56 /* input width */,
-    v7 /* input */, v8 /* output */,
+    v7.data() /* input */, v8.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #7" << std::endl;
@@ -1739,7 +1740,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op8,
     1 /* batch size */, 56 /* input height */, 56 /* input width */,
-    v8 /* input */, v9 /* output */,
+    v8.data() /* input */, v9.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #8" << std::endl;
@@ -1752,7 +1753,7 @@
     status = xnn_setup_add_nd_f32(
       op9,
       4, a_shape, 4, b_shape,
-      v9 /* a */, v6 /* b */, v10 /* output */,
+      v9.data() /* a */, v6.data() /* b */, v10.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -1763,7 +1764,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op10,
     1 /* batch size */, 56 /* input height */, 56 /* input width */,
-    v10 /* input */, v11 /* output */,
+    v10.data() /* input */, v11.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #10" << std::endl;
@@ -1773,7 +1774,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op11,
     1 /* batch size */, 56 /* input height */, 56 /* input width */,
-    v11 /* input */, v12 /* output */,
+    v11.data() /* input */, v12.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #11" << std::endl;
@@ -1783,7 +1784,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op12,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v12 /* input */, v13 /* output */,
+    v12.data() /* input */, v13.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #12" << std::endl;
@@ -1793,7 +1794,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op13,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v13 /* input */, v14 /* output */,
+    v13.data() /* input */, v14.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #13" << std::endl;
@@ -1803,7 +1804,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op14,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v14 /* input */, v15 /* output */,
+    v14.data() /* input */, v15.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #14" << std::endl;
@@ -1813,7 +1814,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op15,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v15 /* input */, v16 /* output */,
+    v15.data() /* input */, v16.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #15" << std::endl;
@@ -1826,7 +1827,7 @@
     status = xnn_setup_add_nd_f32(
       op16,
       4, a_shape, 4, b_shape,
-      v16 /* a */, v13 /* b */, v17 /* output */,
+      v16.data() /* a */, v13.data() /* b */, v17.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -1837,7 +1838,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op17,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v17 /* input */, v18 /* output */,
+    v17.data() /* input */, v18.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #17" << std::endl;
@@ -1847,7 +1848,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op18,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v18 /* input */, v19 /* output */,
+    v18.data() /* input */, v19.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #18" << std::endl;
@@ -1857,7 +1858,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op19,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v19 /* input */, v20 /* output */,
+    v19.data() /* input */, v20.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #19" << std::endl;
@@ -1870,7 +1871,7 @@
     status = xnn_setup_add_nd_f32(
       op20,
       4, a_shape, 4, b_shape,
-      v20 /* a */, v17 /* b */, v21 /* output */,
+      v20.data() /* a */, v17.data() /* b */, v21.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -1881,7 +1882,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op21,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v21 /* input */, v22 /* output */,
+    v21.data() /* input */, v22.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #21" << std::endl;
@@ -1891,7 +1892,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op22,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v22 /* input */, v23 /* output */,
+    v22.data() /* input */, v23.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #22" << std::endl;
@@ -1901,7 +1902,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op23,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v23 /* input */, v24 /* output */,
+    v23.data() /* input */, v24.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #23" << std::endl;
@@ -1911,7 +1912,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op24,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v24 /* input */, v25 /* output */,
+    v24.data() /* input */, v25.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #24" << std::endl;
@@ -1921,7 +1922,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op25,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v25 /* input */, v26 /* output */,
+    v25.data() /* input */, v26.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #25" << std::endl;
@@ -1931,7 +1932,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op26,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v26 /* input */, v27 /* output */,
+    v26.data() /* input */, v27.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #26" << std::endl;
@@ -1944,7 +1945,7 @@
     status = xnn_setup_add_nd_f32(
       op27,
       4, a_shape, 4, b_shape,
-      v27 /* a */, v24 /* b */, v28 /* output */,
+      v27.data() /* a */, v24.data() /* b */, v28.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -1955,7 +1956,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op28,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v28 /* input */, v29 /* output */,
+    v28.data() /* input */, v29.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #28" << std::endl;
@@ -1965,7 +1966,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op29,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v29 /* input */, v30 /* output */,
+    v29.data() /* input */, v30.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #29" << std::endl;
@@ -1975,7 +1976,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op30,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v30 /* input */, v31 /* output */,
+    v30.data() /* input */, v31.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #30" << std::endl;
@@ -1988,7 +1989,7 @@
     status = xnn_setup_add_nd_f32(
       op31,
       4, a_shape, 4, b_shape,
-      v31 /* a */, v28 /* b */, v32 /* output */,
+      v31.data() /* a */, v28.data() /* b */, v32.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -1999,7 +2000,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op32,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v32 /* input */, v33 /* output */,
+    v32.data() /* input */, v33.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #32" << std::endl;
@@ -2009,7 +2010,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op33,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v33 /* input */, v34 /* output */,
+    v33.data() /* input */, v34.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #33" << std::endl;
@@ -2019,7 +2020,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op34,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v34 /* input */, v35 /* output */,
+    v34.data() /* input */, v35.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #34" << std::endl;
@@ -2032,7 +2033,7 @@
     status = xnn_setup_add_nd_f32(
       op35,
       4, a_shape, 4, b_shape,
-      v35 /* a */, v32 /* b */, v36 /* output */,
+      v35.data() /* a */, v32.data() /* b */, v36.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -2043,7 +2044,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op36,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v36 /* input */, v37 /* output */,
+    v36.data() /* input */, v37.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #36" << std::endl;
@@ -2053,7 +2054,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op37,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v37 /* input */, v38 /* output */,
+    v37.data() /* input */, v38.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #37" << std::endl;
@@ -2063,7 +2064,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op38,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v38 /* input */, v39 /* output */,
+    v38.data() /* input */, v39.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #38" << std::endl;
@@ -2073,7 +2074,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op39,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v39 /* input */, v40 /* output */,
+    v39.data() /* input */, v40.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #39" << std::endl;
@@ -2083,7 +2084,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op40,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v40 /* input */, v41 /* output */,
+    v40.data() /* input */, v41.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #40" << std::endl;
@@ -2093,7 +2094,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op41,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v41 /* input */, v42 /* output */,
+    v41.data() /* input */, v42.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #41" << std::endl;
@@ -2106,7 +2107,7 @@
     status = xnn_setup_add_nd_f32(
       op42,
       4, a_shape, 4, b_shape,
-      v42 /* a */, v39 /* b */, v43 /* output */,
+      v42.data() /* a */, v39.data() /* b */, v43.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -2117,7 +2118,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op43,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v43 /* input */, v44 /* output */,
+    v43.data() /* input */, v44.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #43" << std::endl;
@@ -2127,7 +2128,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op44,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v44 /* input */, v45 /* output */,
+    v44.data() /* input */, v45.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #44" << std::endl;
@@ -2137,7 +2138,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op45,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v45 /* input */, v46 /* output */,
+    v45.data() /* input */, v46.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #45" << std::endl;
@@ -2150,7 +2151,7 @@
     status = xnn_setup_add_nd_f32(
       op46,
       4, a_shape, 4, b_shape,
-      v46 /* a */, v43 /* b */, v47 /* output */,
+      v46.data() /* a */, v43.data() /* b */, v47.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -2161,7 +2162,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op47,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v47 /* input */, v48 /* output */,
+    v47.data() /* input */, v48.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #47" << std::endl;
@@ -2171,7 +2172,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op48,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v48 /* input */, v49 /* output */,
+    v48.data() /* input */, v49.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #48" << std::endl;
@@ -2181,7 +2182,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op49,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v49 /* input */, v50 /* output */,
+    v49.data() /* input */, v50.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #49" << std::endl;
@@ -2191,7 +2192,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op50,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v50 /* input */, v51 /* output */,
+    v50.data() /* input */, v51.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #50" << std::endl;
@@ -2201,7 +2202,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op51,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v51 /* input */, v52 /* output */,
+    v51.data() /* input */, v52.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #51" << std::endl;
@@ -2211,7 +2212,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op52,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v52 /* input */, v53 /* output */,
+    v52.data() /* input */, v53.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #52" << std::endl;
@@ -2224,7 +2225,7 @@
     status = xnn_setup_add_nd_f32(
       op53,
       4, a_shape, 4, b_shape,
-      v53 /* a */, v50 /* b */, v54 /* output */,
+      v53.data() /* a */, v50.data() /* b */, v54.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -2235,7 +2236,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op54,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v54 /* input */, v55 /* output */,
+    v54.data() /* input */, v55.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #54" << std::endl;
@@ -2245,7 +2246,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op55,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v55 /* input */, v56 /* output */,
+    v55.data() /* input */, v56.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #55" << std::endl;
@@ -2255,7 +2256,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op56,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v56 /* input */, v57 /* output */,
+    v56.data() /* input */, v57.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #56" << std::endl;
@@ -2268,7 +2269,7 @@
     status = xnn_setup_add_nd_f32(
       op57,
       4, a_shape, 4, b_shape,
-      v57 /* a */, v54 /* b */, v58 /* output */,
+      v57.data() /* a */, v54.data() /* b */, v58.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -2279,7 +2280,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op58,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v58 /* input */, v59 /* output */,
+    v58.data() /* input */, v59.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #58" << std::endl;
@@ -2289,7 +2290,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op59,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v59 /* input */, v60 /* output */,
+    v59.data() /* input */, v60.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #59" << std::endl;
@@ -2299,7 +2300,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op60,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v60 /* input */, v61 /* output */,
+    v60.data() /* input */, v61.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #60" << std::endl;
@@ -2309,7 +2310,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op61,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v61 /* input */, v62 /* output */,
+    v61.data() /* input */, v62.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #61" << std::endl;
@@ -2319,7 +2320,7 @@
   status = xnn_setup_global_average_pooling_nwc_f32(
     op62,
     1 /* batch size */, 49 /* width */,
-    v62 /* input */, v63 /* output */,
+    v62.data() /* input */, v63.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #62" << std::endl;
@@ -2329,7 +2330,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op63,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v63 /* input */, v64 /* output */,
+    v63.data() /* input */, v64.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #63" << std::endl;
diff --git a/models/fp32-mobilenet-v3-large.cc b/models/fp32-mobilenet-v3-large.cc
index 7d4bafc..03a96d1 100644
--- a/models/fp32-mobilenet-v3-large.cc
+++ b/models/fp32-mobilenet-v3-large.cc
@@ -5,6 +5,7 @@
 
 #include <xnnpack.h>
 
+#include <array>
 #include <algorithm>
 #include <functional>
 #include <iostream>
@@ -16,494 +17,494 @@
 namespace models {
 
 ExecutionPlan FP32MobileNetV3Large(pthreadpool_t threadpool) {
-  alignas(16) static float v0[150528];
-  alignas(16) static float v1[200704];
-  alignas(16) static float v2[200704];
-  alignas(16) static float v3[200704];
-  alignas(16) static float v4[200704];
-  alignas(16) static float v5[200704];
-  alignas(16) static float v6[802816];
-  alignas(16) static float v7[200704];
-  alignas(16) static float v8[75264];
-  alignas(16) static float v9[225792];
-  alignas(16) static float v10[225792];
-  alignas(16) static float v11[75264];
-  alignas(16) static float v12[75264];
-  alignas(16) static float v13[225792];
-  alignas(16) static float v14[56448];
-  alignas(16) static float v15[72];
-  alignas(16) static float v16[24];
-  alignas(16) static float v17[72];
-  alignas(16) static float v18[56448];
-  alignas(16) static float v19[31360];
-  alignas(16) static float v20[94080];
-  alignas(16) static float v21[94080];
-  alignas(16) static float v22[120];
-  alignas(16) static float v23[32];
-  alignas(16) static float v24[120];
-  alignas(16) static float v25[94080];
-  alignas(16) static float v26[31360];
-  alignas(16) static float v27[31360];
-  alignas(16) static float v28[94080];
-  alignas(16) static float v29[94080];
-  alignas(16) static float v30[120];
-  alignas(16) static float v31[32];
-  alignas(16) static float v32[120];
-  alignas(16) static float v33[94080];
-  alignas(16) static float v34[31360];
-  alignas(16) static float v35[31360];
-  alignas(16) static float v36[188160];
-  alignas(16) static float v37[188160];
-  alignas(16) static float v38[47040];
-  alignas(16) static float v39[47040];
-  alignas(16) static float v40[15680];
-  alignas(16) static float v41[39200];
-  alignas(16) static float v42[39200];
-  alignas(16) static float v43[39200];
-  alignas(16) static float v44[39200];
-  alignas(16) static float v45[15680];
-  alignas(16) static float v46[15680];
-  alignas(16) static float v47[36064];
-  alignas(16) static float v48[36064];
-  alignas(16) static float v49[36064];
-  alignas(16) static float v50[36064];
-  alignas(16) static float v51[15680];
-  alignas(16) static float v52[15680];
-  alignas(16) static float v53[36064];
-  alignas(16) static float v54[36064];
-  alignas(16) static float v55[36064];
-  alignas(16) static float v56[36064];
-  alignas(16) static float v57[15680];
-  alignas(16) static float v58[15680];
-  alignas(16) static float v59[94080];
-  alignas(16) static float v60[94080];
-  alignas(16) static float v61[94080];
-  alignas(16) static float v62[94080];
-  alignas(16) static float v63[480];
-  alignas(16) static float v64[120];
-  alignas(16) static float v65[480];
-  alignas(16) static float v66[94080];
-  alignas(16) static float v67[21952];
-  alignas(16) static float v68[131712];
-  alignas(16) static float v69[131712];
-  alignas(16) static float v70[131712];
-  alignas(16) static float v71[131712];
-  alignas(16) static float v72[672];
-  alignas(16) static float v73[168];
-  alignas(16) static float v74[672];
-  alignas(16) static float v75[131712];
-  alignas(16) static float v76[21952];
-  alignas(16) static float v77[21952];
-  alignas(16) static float v78[131712];
-  alignas(16) static float v79[131712];
-  alignas(16) static float v80[32928];
-  alignas(16) static float v81[32928];
-  alignas(16) static float v82[672];
-  alignas(16) static float v83[168];
-  alignas(16) static float v84[672];
-  alignas(16) static float v85[32928];
-  alignas(16) static float v86[7840];
-  alignas(16) static float v87[47040];
-  alignas(16) static float v88[47040];
-  alignas(16) static float v89[47040];
-  alignas(16) static float v90[47040];
-  alignas(16) static float v91[960];
-  alignas(16) static float v92[240];
-  alignas(16) static float v93[960];
-  alignas(16) static float v94[47040];
-  alignas(16) static float v95[7840];
-  alignas(16) static float v96[7840];
-  alignas(16) static float v97[47040];
-  alignas(16) static float v98[47040];
-  alignas(16) static float v99[47040];
-  alignas(16) static float v100[47040];
-  alignas(16) static float v101[960];
-  alignas(16) static float v102[240];
-  alignas(16) static float v103[960];
-  alignas(16) static float v104[47040];
-  alignas(16) static float v105[7840];
-  alignas(16) static float v106[7840];
-  alignas(16) static float v107[47040];
-  alignas(16) static float v108[47040];
-  alignas(16) static float v109[960];
-  alignas(16) static float v110[1280];
-  alignas(16) static float v111[1280];
-  alignas(16) static float v112[1280];
-  alignas(16) static float v113[1001];
-  alignas(16) static float w114[432];
-  alignas(16) static float w115[16];
-  alignas(16) static float w116[144];
-  alignas(16) static float w117[16];
-  alignas(16) static float w118[256];
-  alignas(16) static float w119[16];
-  alignas(16) static float w120[1024];
-  alignas(16) static float w121[64];
-  alignas(16) static float w122[576];
-  alignas(16) static float w123[64];
-  alignas(16) static float w124[1536];
-  alignas(16) static float w125[24];
-  alignas(16) static float w126[1728];
-  alignas(16) static float w127[72];
-  alignas(16) static float w128[648];
-  alignas(16) static float w129[72];
-  alignas(16) static float w130[1728];
-  alignas(16) static float w131[24];
-  alignas(16) static float w132[1728];
-  alignas(16) static float w133[72];
-  alignas(16) static float w134[1800];
-  alignas(16) static float w135[72];
-  alignas(16) static float w136[1728];
-  alignas(16) static float w137[24];
-  alignas(16) static float w138[1728];
-  alignas(16) static float w139[72];
-  alignas(16) static float w140[2880];
-  alignas(16) static float w141[40];
-  alignas(16) static float w142[4800];
-  alignas(16) static float w143[120];
-  alignas(16) static float w144[3000];
-  alignas(16) static float w145[120];
-  alignas(16) static float w146[3840];
-  alignas(16) static float w147[32];
-  alignas(16) static float w148[3840];
-  alignas(16) static float w149[120];
-  alignas(16) static float w150[4800];
-  alignas(16) static float w151[40];
-  alignas(16) static float w152[4800];
-  alignas(16) static float w153[120];
-  alignas(16) static float w154[3000];
-  alignas(16) static float w155[120];
-  alignas(16) static float w156[3840];
-  alignas(16) static float w157[32];
-  alignas(16) static float w158[3840];
-  alignas(16) static float w159[120];
-  alignas(16) static float w160[4800];
-  alignas(16) static float w161[40];
-  alignas(16) static float w162[9600];
-  alignas(16) static float w163[240];
-  alignas(16) static float w164[2160];
-  alignas(16) static float w165[240];
-  alignas(16) static float w166[19200];
-  alignas(16) static float w167[80];
-  alignas(16) static float w168[16000];
-  alignas(16) static float w169[200];
-  alignas(16) static float w170[1800];
-  alignas(16) static float w171[200];
-  alignas(16) static float w172[16000];
-  alignas(16) static float w173[80];
-  alignas(16) static float w174[14720];
-  alignas(16) static float w175[184];
-  alignas(16) static float w176[1656];
-  alignas(16) static float w177[184];
-  alignas(16) static float w178[14720];
-  alignas(16) static float w179[80];
-  alignas(16) static float w180[14720];
-  alignas(16) static float w181[184];
-  alignas(16) static float w182[1656];
-  alignas(16) static float w183[184];
-  alignas(16) static float w184[14720];
-  alignas(16) static float w185[80];
-  alignas(16) static float w186[38400];
-  alignas(16) static float w187[480];
-  alignas(16) static float w188[4320];
-  alignas(16) static float w189[480];
-  alignas(16) static float w190[57600];
-  alignas(16) static float w191[120];
-  alignas(16) static float w192[57600];
-  alignas(16) static float w193[480];
-  alignas(16) static float w194[53760];
-  alignas(16) static float w195[112];
-  alignas(16) static float w196[75264];
-  alignas(16) static float w197[672];
-  alignas(16) static float w198[6048];
-  alignas(16) static float w199[672];
-  alignas(16) static float w200[112896];
-  alignas(16) static float w201[168];
-  alignas(16) static float w202[112896];
-  alignas(16) static float w203[672];
-  alignas(16) static float w204[75264];
-  alignas(16) static float w205[112];
-  alignas(16) static float w206[75264];
-  alignas(16) static float w207[672];
-  alignas(16) static float w208[16800];
-  alignas(16) static float w209[672];
-  alignas(16) static float w210[112896];
-  alignas(16) static float w211[168];
-  alignas(16) static float w212[112896];
-  alignas(16) static float w213[672];
-  alignas(16) static float w214[107520];
-  alignas(16) static float w215[160];
-  alignas(16) static float w216[153600];
-  alignas(16) static float w217[960];
-  alignas(16) static float w218[24000];
-  alignas(16) static float w219[960];
-  alignas(16) static float w220[230400];
-  alignas(16) static float w221[240];
-  alignas(16) static float w222[230400];
-  alignas(16) static float w223[960];
-  alignas(16) static float w224[153600];
-  alignas(16) static float w225[160];
-  alignas(16) static float w226[153600];
-  alignas(16) static float w227[960];
-  alignas(16) static float w228[24000];
-  alignas(16) static float w229[960];
-  alignas(16) static float w230[230400];
-  alignas(16) static float w231[240];
-  alignas(16) static float w232[230400];
-  alignas(16) static float w233[960];
-  alignas(16) static float w234[153600];
-  alignas(16) static float w235[160];
-  alignas(16) static float w236[153600];
-  alignas(16) static float w237[960];
-  alignas(16) static float w238[1228800];
-  alignas(16) static float w239[1280];
-  alignas(16) static float w240[1281280];
-  alignas(16) static float w241[1001];
+  alignas(16) static std::array<float, 150528> v0;
+  alignas(16) static std::array<float, 200704> v1;
+  alignas(16) static std::array<float, 200704> v2;
+  alignas(16) static std::array<float, 200704> v3;
+  alignas(16) static std::array<float, 200704> v4;
+  alignas(16) static std::array<float, 200704> v5;
+  alignas(16) static std::array<float, 802816> v6;
+  alignas(16) static std::array<float, 200704> v7;
+  alignas(16) static std::array<float, 75264> v8;
+  alignas(16) static std::array<float, 225792> v9;
+  alignas(16) static std::array<float, 225792> v10;
+  alignas(16) static std::array<float, 75264> v11;
+  alignas(16) static std::array<float, 75264> v12;
+  alignas(16) static std::array<float, 225792> v13;
+  alignas(16) static std::array<float, 56448> v14;
+  alignas(16) static std::array<float, 72> v15;
+  alignas(16) static std::array<float, 24> v16;
+  alignas(16) static std::array<float, 72> v17;
+  alignas(16) static std::array<float, 56448> v18;
+  alignas(16) static std::array<float, 31360> v19;
+  alignas(16) static std::array<float, 94080> v20;
+  alignas(16) static std::array<float, 94080> v21;
+  alignas(16) static std::array<float, 120> v22;
+  alignas(16) static std::array<float, 32> v23;
+  alignas(16) static std::array<float, 120> v24;
+  alignas(16) static std::array<float, 94080> v25;
+  alignas(16) static std::array<float, 31360> v26;
+  alignas(16) static std::array<float, 31360> v27;
+  alignas(16) static std::array<float, 94080> v28;
+  alignas(16) static std::array<float, 94080> v29;
+  alignas(16) static std::array<float, 120> v30;
+  alignas(16) static std::array<float, 32> v31;
+  alignas(16) static std::array<float, 120> v32;
+  alignas(16) static std::array<float, 94080> v33;
+  alignas(16) static std::array<float, 31360> v34;
+  alignas(16) static std::array<float, 31360> v35;
+  alignas(16) static std::array<float, 188160> v36;
+  alignas(16) static std::array<float, 188160> v37;
+  alignas(16) static std::array<float, 47040> v38;
+  alignas(16) static std::array<float, 47040> v39;
+  alignas(16) static std::array<float, 15680> v40;
+  alignas(16) static std::array<float, 39200> v41;
+  alignas(16) static std::array<float, 39200> v42;
+  alignas(16) static std::array<float, 39200> v43;
+  alignas(16) static std::array<float, 39200> v44;
+  alignas(16) static std::array<float, 15680> v45;
+  alignas(16) static std::array<float, 15680> v46;
+  alignas(16) static std::array<float, 36064> v47;
+  alignas(16) static std::array<float, 36064> v48;
+  alignas(16) static std::array<float, 36064> v49;
+  alignas(16) static std::array<float, 36064> v50;
+  alignas(16) static std::array<float, 15680> v51;
+  alignas(16) static std::array<float, 15680> v52;
+  alignas(16) static std::array<float, 36064> v53;
+  alignas(16) static std::array<float, 36064> v54;
+  alignas(16) static std::array<float, 36064> v55;
+  alignas(16) static std::array<float, 36064> v56;
+  alignas(16) static std::array<float, 15680> v57;
+  alignas(16) static std::array<float, 15680> v58;
+  alignas(16) static std::array<float, 94080> v59;
+  alignas(16) static std::array<float, 94080> v60;
+  alignas(16) static std::array<float, 94080> v61;
+  alignas(16) static std::array<float, 94080> v62;
+  alignas(16) static std::array<float, 480> v63;
+  alignas(16) static std::array<float, 120> v64;
+  alignas(16) static std::array<float, 480> v65;
+  alignas(16) static std::array<float, 94080> v66;
+  alignas(16) static std::array<float, 21952> v67;
+  alignas(16) static std::array<float, 131712> v68;
+  alignas(16) static std::array<float, 131712> v69;
+  alignas(16) static std::array<float, 131712> v70;
+  alignas(16) static std::array<float, 131712> v71;
+  alignas(16) static std::array<float, 672> v72;
+  alignas(16) static std::array<float, 168> v73;
+  alignas(16) static std::array<float, 672> v74;
+  alignas(16) static std::array<float, 131712> v75;
+  alignas(16) static std::array<float, 21952> v76;
+  alignas(16) static std::array<float, 21952> v77;
+  alignas(16) static std::array<float, 131712> v78;
+  alignas(16) static std::array<float, 131712> v79;
+  alignas(16) static std::array<float, 32928> v80;
+  alignas(16) static std::array<float, 32928> v81;
+  alignas(16) static std::array<float, 672> v82;
+  alignas(16) static std::array<float, 168> v83;
+  alignas(16) static std::array<float, 672> v84;
+  alignas(16) static std::array<float, 32928> v85;
+  alignas(16) static std::array<float, 7840> v86;
+  alignas(16) static std::array<float, 47040> v87;
+  alignas(16) static std::array<float, 47040> v88;
+  alignas(16) static std::array<float, 47040> v89;
+  alignas(16) static std::array<float, 47040> v90;
+  alignas(16) static std::array<float, 960> v91;
+  alignas(16) static std::array<float, 240> v92;
+  alignas(16) static std::array<float, 960> v93;
+  alignas(16) static std::array<float, 47040> v94;
+  alignas(16) static std::array<float, 7840> v95;
+  alignas(16) static std::array<float, 7840> v96;
+  alignas(16) static std::array<float, 47040> v97;
+  alignas(16) static std::array<float, 47040> v98;
+  alignas(16) static std::array<float, 47040> v99;
+  alignas(16) static std::array<float, 47040> v100;
+  alignas(16) static std::array<float, 960> v101;
+  alignas(16) static std::array<float, 240> v102;
+  alignas(16) static std::array<float, 960> v103;
+  alignas(16) static std::array<float, 47040> v104;
+  alignas(16) static std::array<float, 7840> v105;
+  alignas(16) static std::array<float, 7840> v106;
+  alignas(16) static std::array<float, 47040> v107;
+  alignas(16) static std::array<float, 47040> v108;
+  alignas(16) static std::array<float, 960> v109;
+  alignas(16) static std::array<float, 1280> v110;
+  alignas(16) static std::array<float, 1280> v111;
+  alignas(16) static std::array<float, 1280> v112;
+  alignas(16) static std::array<float, 1001> v113;
+  alignas(16) static std::array<float, 432> w114;
+  alignas(16) static std::array<float, 16> w115;
+  alignas(16) static std::array<float, 144> w116;
+  alignas(16) static std::array<float, 16> w117;
+  alignas(16) static std::array<float, 256> w118;
+  alignas(16) static std::array<float, 16> w119;
+  alignas(16) static std::array<float, 1024> w120;
+  alignas(16) static std::array<float, 64> w121;
+  alignas(16) static std::array<float, 576> w122;
+  alignas(16) static std::array<float, 64> w123;
+  alignas(16) static std::array<float, 1536> w124;
+  alignas(16) static std::array<float, 24> w125;
+  alignas(16) static std::array<float, 1728> w126;
+  alignas(16) static std::array<float, 72> w127;
+  alignas(16) static std::array<float, 648> w128;
+  alignas(16) static std::array<float, 72> w129;
+  alignas(16) static std::array<float, 1728> w130;
+  alignas(16) static std::array<float, 24> w131;
+  alignas(16) static std::array<float, 1728> w132;
+  alignas(16) static std::array<float, 72> w133;
+  alignas(16) static std::array<float, 1800> w134;
+  alignas(16) static std::array<float, 72> w135;
+  alignas(16) static std::array<float, 1728> w136;
+  alignas(16) static std::array<float, 24> w137;
+  alignas(16) static std::array<float, 1728> w138;
+  alignas(16) static std::array<float, 72> w139;
+  alignas(16) static std::array<float, 2880> w140;
+  alignas(16) static std::array<float, 40> w141;
+  alignas(16) static std::array<float, 4800> w142;
+  alignas(16) static std::array<float, 120> w143;
+  alignas(16) static std::array<float, 3000> w144;
+  alignas(16) static std::array<float, 120> w145;
+  alignas(16) static std::array<float, 3840> w146;
+  alignas(16) static std::array<float, 32> w147;
+  alignas(16) static std::array<float, 3840> w148;
+  alignas(16) static std::array<float, 120> w149;
+  alignas(16) static std::array<float, 4800> w150;
+  alignas(16) static std::array<float, 40> w151;
+  alignas(16) static std::array<float, 4800> w152;
+  alignas(16) static std::array<float, 120> w153;
+  alignas(16) static std::array<float, 3000> w154;
+  alignas(16) static std::array<float, 120> w155;
+  alignas(16) static std::array<float, 3840> w156;
+  alignas(16) static std::array<float, 32> w157;
+  alignas(16) static std::array<float, 3840> w158;
+  alignas(16) static std::array<float, 120> w159;
+  alignas(16) static std::array<float, 4800> w160;
+  alignas(16) static std::array<float, 40> w161;
+  alignas(16) static std::array<float, 9600> w162;
+  alignas(16) static std::array<float, 240> w163;
+  alignas(16) static std::array<float, 2160> w164;
+  alignas(16) static std::array<float, 240> w165;
+  alignas(16) static std::array<float, 19200> w166;
+  alignas(16) static std::array<float, 80> w167;
+  alignas(16) static std::array<float, 16000> w168;
+  alignas(16) static std::array<float, 200> w169;
+  alignas(16) static std::array<float, 1800> w170;
+  alignas(16) static std::array<float, 200> w171;
+  alignas(16) static std::array<float, 16000> w172;
+  alignas(16) static std::array<float, 80> w173;
+  alignas(16) static std::array<float, 14720> w174;
+  alignas(16) static std::array<float, 184> w175;
+  alignas(16) static std::array<float, 1656> w176;
+  alignas(16) static std::array<float, 184> w177;
+  alignas(16) static std::array<float, 14720> w178;
+  alignas(16) static std::array<float, 80> w179;
+  alignas(16) static std::array<float, 14720> w180;
+  alignas(16) static std::array<float, 184> w181;
+  alignas(16) static std::array<float, 1656> w182;
+  alignas(16) static std::array<float, 184> w183;
+  alignas(16) static std::array<float, 14720> w184;
+  alignas(16) static std::array<float, 80> w185;
+  alignas(16) static std::array<float, 38400> w186;
+  alignas(16) static std::array<float, 480> w187;
+  alignas(16) static std::array<float, 4320> w188;
+  alignas(16) static std::array<float, 480> w189;
+  alignas(16) static std::array<float, 57600> w190;
+  alignas(16) static std::array<float, 120> w191;
+  alignas(16) static std::array<float, 57600> w192;
+  alignas(16) static std::array<float, 480> w193;
+  alignas(16) static std::array<float, 53760> w194;
+  alignas(16) static std::array<float, 112> w195;
+  alignas(16) static std::array<float, 75264> w196;
+  alignas(16) static std::array<float, 672> w197;
+  alignas(16) static std::array<float, 6048> w198;
+  alignas(16) static std::array<float, 672> w199;
+  alignas(16) static std::array<float, 112896> w200;
+  alignas(16) static std::array<float, 168> w201;
+  alignas(16) static std::array<float, 112896> w202;
+  alignas(16) static std::array<float, 672> w203;
+  alignas(16) static std::array<float, 75264> w204;
+  alignas(16) static std::array<float, 112> w205;
+  alignas(16) static std::array<float, 75264> w206;
+  alignas(16) static std::array<float, 672> w207;
+  alignas(16) static std::array<float, 16800> w208;
+  alignas(16) static std::array<float, 672> w209;
+  alignas(16) static std::array<float, 112896> w210;
+  alignas(16) static std::array<float, 168> w211;
+  alignas(16) static std::array<float, 112896> w212;
+  alignas(16) static std::array<float, 672> w213;
+  alignas(16) static std::array<float, 107520> w214;
+  alignas(16) static std::array<float, 160> w215;
+  alignas(16) static std::array<float, 153600> w216;
+  alignas(16) static std::array<float, 960> w217;
+  alignas(16) static std::array<float, 24000> w218;
+  alignas(16) static std::array<float, 960> w219;
+  alignas(16) static std::array<float, 230400> w220;
+  alignas(16) static std::array<float, 240> w221;
+  alignas(16) static std::array<float, 230400> w222;
+  alignas(16) static std::array<float, 960> w223;
+  alignas(16) static std::array<float, 153600> w224;
+  alignas(16) static std::array<float, 160> w225;
+  alignas(16) static std::array<float, 153600> w226;
+  alignas(16) static std::array<float, 960> w227;
+  alignas(16) static std::array<float, 24000> w228;
+  alignas(16) static std::array<float, 960> w229;
+  alignas(16) static std::array<float, 230400> w230;
+  alignas(16) static std::array<float, 240> w231;
+  alignas(16) static std::array<float, 230400> w232;
+  alignas(16) static std::array<float, 960> w233;
+  alignas(16) static std::array<float, 153600> w234;
+  alignas(16) static std::array<float, 160> w235;
+  alignas(16) static std::array<float, 153600> w236;
+  alignas(16) static std::array<float, 960> w237;
+  alignas(16) static std::array<float, 1228800> w238;
+  alignas(16) static std::array<float, 1280> w239;
+  alignas(16) static std::array<float, 1281280> w240;
+  alignas(16) static std::array<float, 1001> w241;
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.0f, +1.0f), std::ref(rng));
-  std::generate(v0, v0 + 150528, std::ref(f32rng));
-  std::generate(v1, v1 + 200704, std::ref(f32rng));
-  std::generate(v2, v2 + 200704, std::ref(f32rng));
-  std::generate(v3, v3 + 200704, std::ref(f32rng));
-  std::generate(v4, v4 + 200704, std::ref(f32rng));
-  std::generate(v5, v5 + 200704, std::ref(f32rng));
-  std::generate(v6, v6 + 802816, std::ref(f32rng));
-  std::generate(v7, v7 + 200704, std::ref(f32rng));
-  std::generate(v8, v8 + 75264, std::ref(f32rng));
-  std::generate(v9, v9 + 225792, std::ref(f32rng));
-  std::generate(v10, v10 + 225792, std::ref(f32rng));
-  std::generate(v11, v11 + 75264, std::ref(f32rng));
-  std::generate(v12, v12 + 75264, std::ref(f32rng));
-  std::generate(v13, v13 + 225792, std::ref(f32rng));
-  std::generate(v14, v14 + 56448, std::ref(f32rng));
-  std::generate(v15, v15 + 72, std::ref(f32rng));
-  std::generate(v16, v16 + 24, std::ref(f32rng));
-  std::generate(v17, v17 + 72, std::ref(f32rng));
-  std::generate(v18, v18 + 56448, std::ref(f32rng));
-  std::generate(v19, v19 + 31360, std::ref(f32rng));
-  std::generate(v20, v20 + 94080, std::ref(f32rng));
-  std::generate(v21, v21 + 94080, std::ref(f32rng));
-  std::generate(v22, v22 + 120, std::ref(f32rng));
-  std::generate(v23, v23 + 32, std::ref(f32rng));
-  std::generate(v24, v24 + 120, std::ref(f32rng));
-  std::generate(v25, v25 + 94080, std::ref(f32rng));
-  std::generate(v26, v26 + 31360, std::ref(f32rng));
-  std::generate(v27, v27 + 31360, std::ref(f32rng));
-  std::generate(v28, v28 + 94080, std::ref(f32rng));
-  std::generate(v29, v29 + 94080, std::ref(f32rng));
-  std::generate(v30, v30 + 120, std::ref(f32rng));
-  std::generate(v31, v31 + 32, std::ref(f32rng));
-  std::generate(v32, v32 + 120, std::ref(f32rng));
-  std::generate(v33, v33 + 94080, std::ref(f32rng));
-  std::generate(v34, v34 + 31360, std::ref(f32rng));
-  std::generate(v35, v35 + 31360, std::ref(f32rng));
-  std::generate(v36, v36 + 188160, std::ref(f32rng));
-  std::generate(v37, v37 + 188160, std::ref(f32rng));
-  std::generate(v38, v38 + 47040, std::ref(f32rng));
-  std::generate(v39, v39 + 47040, std::ref(f32rng));
-  std::generate(v40, v40 + 15680, std::ref(f32rng));
-  std::generate(v41, v41 + 39200, std::ref(f32rng));
-  std::generate(v42, v42 + 39200, std::ref(f32rng));
-  std::generate(v43, v43 + 39200, std::ref(f32rng));
-  std::generate(v44, v44 + 39200, std::ref(f32rng));
-  std::generate(v45, v45 + 15680, std::ref(f32rng));
-  std::generate(v46, v46 + 15680, std::ref(f32rng));
-  std::generate(v47, v47 + 36064, std::ref(f32rng));
-  std::generate(v48, v48 + 36064, std::ref(f32rng));
-  std::generate(v49, v49 + 36064, std::ref(f32rng));
-  std::generate(v50, v50 + 36064, std::ref(f32rng));
-  std::generate(v51, v51 + 15680, std::ref(f32rng));
-  std::generate(v52, v52 + 15680, std::ref(f32rng));
-  std::generate(v53, v53 + 36064, std::ref(f32rng));
-  std::generate(v54, v54 + 36064, std::ref(f32rng));
-  std::generate(v55, v55 + 36064, std::ref(f32rng));
-  std::generate(v56, v56 + 36064, std::ref(f32rng));
-  std::generate(v57, v57 + 15680, std::ref(f32rng));
-  std::generate(v58, v58 + 15680, std::ref(f32rng));
-  std::generate(v59, v59 + 94080, std::ref(f32rng));
-  std::generate(v60, v60 + 94080, std::ref(f32rng));
-  std::generate(v61, v61 + 94080, std::ref(f32rng));
-  std::generate(v62, v62 + 94080, std::ref(f32rng));
-  std::generate(v63, v63 + 480, std::ref(f32rng));
-  std::generate(v64, v64 + 120, std::ref(f32rng));
-  std::generate(v65, v65 + 480, std::ref(f32rng));
-  std::generate(v66, v66 + 94080, std::ref(f32rng));
-  std::generate(v67, v67 + 21952, std::ref(f32rng));
-  std::generate(v68, v68 + 131712, std::ref(f32rng));
-  std::generate(v69, v69 + 131712, std::ref(f32rng));
-  std::generate(v70, v70 + 131712, std::ref(f32rng));
-  std::generate(v71, v71 + 131712, std::ref(f32rng));
-  std::generate(v72, v72 + 672, std::ref(f32rng));
-  std::generate(v73, v73 + 168, std::ref(f32rng));
-  std::generate(v74, v74 + 672, std::ref(f32rng));
-  std::generate(v75, v75 + 131712, std::ref(f32rng));
-  std::generate(v76, v76 + 21952, std::ref(f32rng));
-  std::generate(v77, v77 + 21952, std::ref(f32rng));
-  std::generate(v78, v78 + 131712, std::ref(f32rng));
-  std::generate(v79, v79 + 131712, std::ref(f32rng));
-  std::generate(v80, v80 + 32928, std::ref(f32rng));
-  std::generate(v81, v81 + 32928, std::ref(f32rng));
-  std::generate(v82, v82 + 672, std::ref(f32rng));
-  std::generate(v83, v83 + 168, std::ref(f32rng));
-  std::generate(v84, v84 + 672, std::ref(f32rng));
-  std::generate(v85, v85 + 32928, std::ref(f32rng));
-  std::generate(v86, v86 + 7840, std::ref(f32rng));
-  std::generate(v87, v87 + 47040, std::ref(f32rng));
-  std::generate(v88, v88 + 47040, std::ref(f32rng));
-  std::generate(v89, v89 + 47040, std::ref(f32rng));
-  std::generate(v90, v90 + 47040, std::ref(f32rng));
-  std::generate(v91, v91 + 960, std::ref(f32rng));
-  std::generate(v92, v92 + 240, std::ref(f32rng));
-  std::generate(v93, v93 + 960, std::ref(f32rng));
-  std::generate(v94, v94 + 47040, std::ref(f32rng));
-  std::generate(v95, v95 + 7840, std::ref(f32rng));
-  std::generate(v96, v96 + 7840, std::ref(f32rng));
-  std::generate(v97, v97 + 47040, std::ref(f32rng));
-  std::generate(v98, v98 + 47040, std::ref(f32rng));
-  std::generate(v99, v99 + 47040, std::ref(f32rng));
-  std::generate(v100, v100 + 47040, std::ref(f32rng));
-  std::generate(v101, v101 + 960, std::ref(f32rng));
-  std::generate(v102, v102 + 240, std::ref(f32rng));
-  std::generate(v103, v103 + 960, std::ref(f32rng));
-  std::generate(v104, v104 + 47040, std::ref(f32rng));
-  std::generate(v105, v105 + 7840, std::ref(f32rng));
-  std::generate(v106, v106 + 7840, std::ref(f32rng));
-  std::generate(v107, v107 + 47040, std::ref(f32rng));
-  std::generate(v108, v108 + 47040, std::ref(f32rng));
-  std::generate(v109, v109 + 960, std::ref(f32rng));
-  std::generate(v110, v110 + 1280, std::ref(f32rng));
-  std::generate(v111, v111 + 1280, std::ref(f32rng));
-  std::generate(v112, v112 + 1280, std::ref(f32rng));
-  std::generate(v113, v113 + 1001, std::ref(f32rng));
-  std::generate(w114, w114 + 432, std::ref(f32rng));
-  std::generate(w115, w115 + 16, std::ref(f32rng));
-  std::generate(w116, w116 + 144, std::ref(f32rng));
-  std::generate(w117, w117 + 16, std::ref(f32rng));
-  std::generate(w118, w118 + 256, std::ref(f32rng));
-  std::generate(w119, w119 + 16, std::ref(f32rng));
-  std::generate(w120, w120 + 1024, std::ref(f32rng));
-  std::generate(w121, w121 + 64, std::ref(f32rng));
-  std::generate(w122, w122 + 576, std::ref(f32rng));
-  std::generate(w123, w123 + 64, std::ref(f32rng));
-  std::generate(w124, w124 + 1536, std::ref(f32rng));
-  std::generate(w125, w125 + 24, std::ref(f32rng));
-  std::generate(w126, w126 + 1728, std::ref(f32rng));
-  std::generate(w127, w127 + 72, std::ref(f32rng));
-  std::generate(w128, w128 + 648, std::ref(f32rng));
-  std::generate(w129, w129 + 72, std::ref(f32rng));
-  std::generate(w130, w130 + 1728, std::ref(f32rng));
-  std::generate(w131, w131 + 24, std::ref(f32rng));
-  std::generate(w132, w132 + 1728, std::ref(f32rng));
-  std::generate(w133, w133 + 72, std::ref(f32rng));
-  std::generate(w134, w134 + 1800, std::ref(f32rng));
-  std::generate(w135, w135 + 72, std::ref(f32rng));
-  std::generate(w136, w136 + 1728, std::ref(f32rng));
-  std::generate(w137, w137 + 24, std::ref(f32rng));
-  std::generate(w138, w138 + 1728, std::ref(f32rng));
-  std::generate(w139, w139 + 72, std::ref(f32rng));
-  std::generate(w140, w140 + 2880, std::ref(f32rng));
-  std::generate(w141, w141 + 40, std::ref(f32rng));
-  std::generate(w142, w142 + 4800, std::ref(f32rng));
-  std::generate(w143, w143 + 120, std::ref(f32rng));
-  std::generate(w144, w144 + 3000, std::ref(f32rng));
-  std::generate(w145, w145 + 120, std::ref(f32rng));
-  std::generate(w146, w146 + 3840, std::ref(f32rng));
-  std::generate(w147, w147 + 32, std::ref(f32rng));
-  std::generate(w148, w148 + 3840, std::ref(f32rng));
-  std::generate(w149, w149 + 120, std::ref(f32rng));
-  std::generate(w150, w150 + 4800, std::ref(f32rng));
-  std::generate(w151, w151 + 40, std::ref(f32rng));
-  std::generate(w152, w152 + 4800, std::ref(f32rng));
-  std::generate(w153, w153 + 120, std::ref(f32rng));
-  std::generate(w154, w154 + 3000, std::ref(f32rng));
-  std::generate(w155, w155 + 120, std::ref(f32rng));
-  std::generate(w156, w156 + 3840, std::ref(f32rng));
-  std::generate(w157, w157 + 32, std::ref(f32rng));
-  std::generate(w158, w158 + 3840, std::ref(f32rng));
-  std::generate(w159, w159 + 120, std::ref(f32rng));
-  std::generate(w160, w160 + 4800, std::ref(f32rng));
-  std::generate(w161, w161 + 40, std::ref(f32rng));
-  std::generate(w162, w162 + 9600, std::ref(f32rng));
-  std::generate(w163, w163 + 240, std::ref(f32rng));
-  std::generate(w164, w164 + 2160, std::ref(f32rng));
-  std::generate(w165, w165 + 240, std::ref(f32rng));
-  std::generate(w166, w166 + 19200, std::ref(f32rng));
-  std::generate(w167, w167 + 80, std::ref(f32rng));
-  std::generate(w168, w168 + 16000, std::ref(f32rng));
-  std::generate(w169, w169 + 200, std::ref(f32rng));
-  std::generate(w170, w170 + 1800, std::ref(f32rng));
-  std::generate(w171, w171 + 200, std::ref(f32rng));
-  std::generate(w172, w172 + 16000, std::ref(f32rng));
-  std::generate(w173, w173 + 80, std::ref(f32rng));
-  std::generate(w174, w174 + 14720, std::ref(f32rng));
-  std::generate(w175, w175 + 184, std::ref(f32rng));
-  std::generate(w176, w176 + 1656, std::ref(f32rng));
-  std::generate(w177, w177 + 184, std::ref(f32rng));
-  std::generate(w178, w178 + 14720, std::ref(f32rng));
-  std::generate(w179, w179 + 80, std::ref(f32rng));
-  std::generate(w180, w180 + 14720, std::ref(f32rng));
-  std::generate(w181, w181 + 184, std::ref(f32rng));
-  std::generate(w182, w182 + 1656, std::ref(f32rng));
-  std::generate(w183, w183 + 184, std::ref(f32rng));
-  std::generate(w184, w184 + 14720, std::ref(f32rng));
-  std::generate(w185, w185 + 80, std::ref(f32rng));
-  std::generate(w186, w186 + 38400, std::ref(f32rng));
-  std::generate(w187, w187 + 480, std::ref(f32rng));
-  std::generate(w188, w188 + 4320, std::ref(f32rng));
-  std::generate(w189, w189 + 480, std::ref(f32rng));
-  std::generate(w190, w190 + 57600, std::ref(f32rng));
-  std::generate(w191, w191 + 120, std::ref(f32rng));
-  std::generate(w192, w192 + 57600, std::ref(f32rng));
-  std::generate(w193, w193 + 480, std::ref(f32rng));
-  std::generate(w194, w194 + 53760, std::ref(f32rng));
-  std::generate(w195, w195 + 112, std::ref(f32rng));
-  std::generate(w196, w196 + 75264, std::ref(f32rng));
-  std::generate(w197, w197 + 672, std::ref(f32rng));
-  std::generate(w198, w198 + 6048, std::ref(f32rng));
-  std::generate(w199, w199 + 672, std::ref(f32rng));
-  std::generate(w200, w200 + 112896, std::ref(f32rng));
-  std::generate(w201, w201 + 168, std::ref(f32rng));
-  std::generate(w202, w202 + 112896, std::ref(f32rng));
-  std::generate(w203, w203 + 672, std::ref(f32rng));
-  std::generate(w204, w204 + 75264, std::ref(f32rng));
-  std::generate(w205, w205 + 112, std::ref(f32rng));
-  std::generate(w206, w206 + 75264, std::ref(f32rng));
-  std::generate(w207, w207 + 672, std::ref(f32rng));
-  std::generate(w208, w208 + 16800, std::ref(f32rng));
-  std::generate(w209, w209 + 672, std::ref(f32rng));
-  std::generate(w210, w210 + 112896, std::ref(f32rng));
-  std::generate(w211, w211 + 168, std::ref(f32rng));
-  std::generate(w212, w212 + 112896, std::ref(f32rng));
-  std::generate(w213, w213 + 672, std::ref(f32rng));
-  std::generate(w214, w214 + 107520, std::ref(f32rng));
-  std::generate(w215, w215 + 160, std::ref(f32rng));
-  std::generate(w216, w216 + 153600, std::ref(f32rng));
-  std::generate(w217, w217 + 960, std::ref(f32rng));
-  std::generate(w218, w218 + 24000, std::ref(f32rng));
-  std::generate(w219, w219 + 960, std::ref(f32rng));
-  std::generate(w220, w220 + 230400, std::ref(f32rng));
-  std::generate(w221, w221 + 240, std::ref(f32rng));
-  std::generate(w222, w222 + 230400, std::ref(f32rng));
-  std::generate(w223, w223 + 960, std::ref(f32rng));
-  std::generate(w224, w224 + 153600, std::ref(f32rng));
-  std::generate(w225, w225 + 160, std::ref(f32rng));
-  std::generate(w226, w226 + 153600, std::ref(f32rng));
-  std::generate(w227, w227 + 960, std::ref(f32rng));
-  std::generate(w228, w228 + 24000, std::ref(f32rng));
-  std::generate(w229, w229 + 960, std::ref(f32rng));
-  std::generate(w230, w230 + 230400, std::ref(f32rng));
-  std::generate(w231, w231 + 240, std::ref(f32rng));
-  std::generate(w232, w232 + 230400, std::ref(f32rng));
-  std::generate(w233, w233 + 960, std::ref(f32rng));
-  std::generate(w234, w234 + 153600, std::ref(f32rng));
-  std::generate(w235, w235 + 160, std::ref(f32rng));
-  std::generate(w236, w236 + 153600, std::ref(f32rng));
-  std::generate(w237, w237 + 960, std::ref(f32rng));
-  std::generate(w238, w238 + 1228800, std::ref(f32rng));
-  std::generate(w239, w239 + 1280, std::ref(f32rng));
-  std::generate(w240, w240 + 1281280, std::ref(f32rng));
-  std::generate(w241, w241 + 1001, std::ref(f32rng));
+  std::generate(v0.begin(), v0.end(), std::ref(f32rng));
+  std::generate(v1.begin(), v1.end(), std::ref(f32rng));
+  std::generate(v2.begin(), v2.end(), std::ref(f32rng));
+  std::generate(v3.begin(), v3.end(), std::ref(f32rng));
+  std::generate(v4.begin(), v4.end(), std::ref(f32rng));
+  std::generate(v5.begin(), v5.end(), std::ref(f32rng));
+  std::generate(v6.begin(), v6.end(), std::ref(f32rng));
+  std::generate(v7.begin(), v7.end(), std::ref(f32rng));
+  std::generate(v8.begin(), v8.end(), std::ref(f32rng));
+  std::generate(v9.begin(), v9.end(), std::ref(f32rng));
+  std::generate(v10.begin(), v10.end(), std::ref(f32rng));
+  std::generate(v11.begin(), v11.end(), std::ref(f32rng));
+  std::generate(v12.begin(), v12.end(), std::ref(f32rng));
+  std::generate(v13.begin(), v13.end(), std::ref(f32rng));
+  std::generate(v14.begin(), v14.end(), std::ref(f32rng));
+  std::generate(v15.begin(), v15.end(), std::ref(f32rng));
+  std::generate(v16.begin(), v16.end(), std::ref(f32rng));
+  std::generate(v17.begin(), v17.end(), std::ref(f32rng));
+  std::generate(v18.begin(), v18.end(), std::ref(f32rng));
+  std::generate(v19.begin(), v19.end(), std::ref(f32rng));
+  std::generate(v20.begin(), v20.end(), std::ref(f32rng));
+  std::generate(v21.begin(), v21.end(), std::ref(f32rng));
+  std::generate(v22.begin(), v22.end(), std::ref(f32rng));
+  std::generate(v23.begin(), v23.end(), std::ref(f32rng));
+  std::generate(v24.begin(), v24.end(), std::ref(f32rng));
+  std::generate(v25.begin(), v25.end(), std::ref(f32rng));
+  std::generate(v26.begin(), v26.end(), std::ref(f32rng));
+  std::generate(v27.begin(), v27.end(), std::ref(f32rng));
+  std::generate(v28.begin(), v28.end(), std::ref(f32rng));
+  std::generate(v29.begin(), v29.end(), std::ref(f32rng));
+  std::generate(v30.begin(), v30.end(), std::ref(f32rng));
+  std::generate(v31.begin(), v31.end(), std::ref(f32rng));
+  std::generate(v32.begin(), v32.end(), std::ref(f32rng));
+  std::generate(v33.begin(), v33.end(), std::ref(f32rng));
+  std::generate(v34.begin(), v34.end(), std::ref(f32rng));
+  std::generate(v35.begin(), v35.end(), std::ref(f32rng));
+  std::generate(v36.begin(), v36.end(), std::ref(f32rng));
+  std::generate(v37.begin(), v37.end(), std::ref(f32rng));
+  std::generate(v38.begin(), v38.end(), std::ref(f32rng));
+  std::generate(v39.begin(), v39.end(), std::ref(f32rng));
+  std::generate(v40.begin(), v40.end(), std::ref(f32rng));
+  std::generate(v41.begin(), v41.end(), std::ref(f32rng));
+  std::generate(v42.begin(), v42.end(), std::ref(f32rng));
+  std::generate(v43.begin(), v43.end(), std::ref(f32rng));
+  std::generate(v44.begin(), v44.end(), std::ref(f32rng));
+  std::generate(v45.begin(), v45.end(), std::ref(f32rng));
+  std::generate(v46.begin(), v46.end(), std::ref(f32rng));
+  std::generate(v47.begin(), v47.end(), std::ref(f32rng));
+  std::generate(v48.begin(), v48.end(), std::ref(f32rng));
+  std::generate(v49.begin(), v49.end(), std::ref(f32rng));
+  std::generate(v50.begin(), v50.end(), std::ref(f32rng));
+  std::generate(v51.begin(), v51.end(), std::ref(f32rng));
+  std::generate(v52.begin(), v52.end(), std::ref(f32rng));
+  std::generate(v53.begin(), v53.end(), std::ref(f32rng));
+  std::generate(v54.begin(), v54.end(), std::ref(f32rng));
+  std::generate(v55.begin(), v55.end(), std::ref(f32rng));
+  std::generate(v56.begin(), v56.end(), std::ref(f32rng));
+  std::generate(v57.begin(), v57.end(), std::ref(f32rng));
+  std::generate(v58.begin(), v58.end(), std::ref(f32rng));
+  std::generate(v59.begin(), v59.end(), std::ref(f32rng));
+  std::generate(v60.begin(), v60.end(), std::ref(f32rng));
+  std::generate(v61.begin(), v61.end(), std::ref(f32rng));
+  std::generate(v62.begin(), v62.end(), std::ref(f32rng));
+  std::generate(v63.begin(), v63.end(), std::ref(f32rng));
+  std::generate(v64.begin(), v64.end(), std::ref(f32rng));
+  std::generate(v65.begin(), v65.end(), std::ref(f32rng));
+  std::generate(v66.begin(), v66.end(), std::ref(f32rng));
+  std::generate(v67.begin(), v67.end(), std::ref(f32rng));
+  std::generate(v68.begin(), v68.end(), std::ref(f32rng));
+  std::generate(v69.begin(), v69.end(), std::ref(f32rng));
+  std::generate(v70.begin(), v70.end(), std::ref(f32rng));
+  std::generate(v71.begin(), v71.end(), std::ref(f32rng));
+  std::generate(v72.begin(), v72.end(), std::ref(f32rng));
+  std::generate(v73.begin(), v73.end(), std::ref(f32rng));
+  std::generate(v74.begin(), v74.end(), std::ref(f32rng));
+  std::generate(v75.begin(), v75.end(), std::ref(f32rng));
+  std::generate(v76.begin(), v76.end(), std::ref(f32rng));
+  std::generate(v77.begin(), v77.end(), std::ref(f32rng));
+  std::generate(v78.begin(), v78.end(), std::ref(f32rng));
+  std::generate(v79.begin(), v79.end(), std::ref(f32rng));
+  std::generate(v80.begin(), v80.end(), std::ref(f32rng));
+  std::generate(v81.begin(), v81.end(), std::ref(f32rng));
+  std::generate(v82.begin(), v82.end(), std::ref(f32rng));
+  std::generate(v83.begin(), v83.end(), std::ref(f32rng));
+  std::generate(v84.begin(), v84.end(), std::ref(f32rng));
+  std::generate(v85.begin(), v85.end(), std::ref(f32rng));
+  std::generate(v86.begin(), v86.end(), std::ref(f32rng));
+  std::generate(v87.begin(), v87.end(), std::ref(f32rng));
+  std::generate(v88.begin(), v88.end(), std::ref(f32rng));
+  std::generate(v89.begin(), v89.end(), std::ref(f32rng));
+  std::generate(v90.begin(), v90.end(), std::ref(f32rng));
+  std::generate(v91.begin(), v91.end(), std::ref(f32rng));
+  std::generate(v92.begin(), v92.end(), std::ref(f32rng));
+  std::generate(v93.begin(), v93.end(), std::ref(f32rng));
+  std::generate(v94.begin(), v94.end(), std::ref(f32rng));
+  std::generate(v95.begin(), v95.end(), std::ref(f32rng));
+  std::generate(v96.begin(), v96.end(), std::ref(f32rng));
+  std::generate(v97.begin(), v97.end(), std::ref(f32rng));
+  std::generate(v98.begin(), v98.end(), std::ref(f32rng));
+  std::generate(v99.begin(), v99.end(), std::ref(f32rng));
+  std::generate(v100.begin(), v100.end(), std::ref(f32rng));
+  std::generate(v101.begin(), v101.end(), std::ref(f32rng));
+  std::generate(v102.begin(), v102.end(), std::ref(f32rng));
+  std::generate(v103.begin(), v103.end(), std::ref(f32rng));
+  std::generate(v104.begin(), v104.end(), std::ref(f32rng));
+  std::generate(v105.begin(), v105.end(), std::ref(f32rng));
+  std::generate(v106.begin(), v106.end(), std::ref(f32rng));
+  std::generate(v107.begin(), v107.end(), std::ref(f32rng));
+  std::generate(v108.begin(), v108.end(), std::ref(f32rng));
+  std::generate(v109.begin(), v109.end(), std::ref(f32rng));
+  std::generate(v110.begin(), v110.end(), std::ref(f32rng));
+  std::generate(v111.begin(), v111.end(), std::ref(f32rng));
+  std::generate(v112.begin(), v112.end(), std::ref(f32rng));
+  std::generate(v113.begin(), v113.end(), std::ref(f32rng));
+  std::generate(w114.begin(), w114.end(), std::ref(f32rng));
+  std::generate(w115.begin(), w115.end(), std::ref(f32rng));
+  std::generate(w116.begin(), w116.end(), std::ref(f32rng));
+  std::generate(w117.begin(), w117.end(), std::ref(f32rng));
+  std::generate(w118.begin(), w118.end(), std::ref(f32rng));
+  std::generate(w119.begin(), w119.end(), std::ref(f32rng));
+  std::generate(w120.begin(), w120.end(), std::ref(f32rng));
+  std::generate(w121.begin(), w121.end(), std::ref(f32rng));
+  std::generate(w122.begin(), w122.end(), std::ref(f32rng));
+  std::generate(w123.begin(), w123.end(), std::ref(f32rng));
+  std::generate(w124.begin(), w124.end(), std::ref(f32rng));
+  std::generate(w125.begin(), w125.end(), std::ref(f32rng));
+  std::generate(w126.begin(), w126.end(), std::ref(f32rng));
+  std::generate(w127.begin(), w127.end(), std::ref(f32rng));
+  std::generate(w128.begin(), w128.end(), std::ref(f32rng));
+  std::generate(w129.begin(), w129.end(), std::ref(f32rng));
+  std::generate(w130.begin(), w130.end(), std::ref(f32rng));
+  std::generate(w131.begin(), w131.end(), std::ref(f32rng));
+  std::generate(w132.begin(), w132.end(), std::ref(f32rng));
+  std::generate(w133.begin(), w133.end(), std::ref(f32rng));
+  std::generate(w134.begin(), w134.end(), std::ref(f32rng));
+  std::generate(w135.begin(), w135.end(), std::ref(f32rng));
+  std::generate(w136.begin(), w136.end(), std::ref(f32rng));
+  std::generate(w137.begin(), w137.end(), std::ref(f32rng));
+  std::generate(w138.begin(), w138.end(), std::ref(f32rng));
+  std::generate(w139.begin(), w139.end(), std::ref(f32rng));
+  std::generate(w140.begin(), w140.end(), std::ref(f32rng));
+  std::generate(w141.begin(), w141.end(), std::ref(f32rng));
+  std::generate(w142.begin(), w142.end(), std::ref(f32rng));
+  std::generate(w143.begin(), w143.end(), std::ref(f32rng));
+  std::generate(w144.begin(), w144.end(), std::ref(f32rng));
+  std::generate(w145.begin(), w145.end(), std::ref(f32rng));
+  std::generate(w146.begin(), w146.end(), std::ref(f32rng));
+  std::generate(w147.begin(), w147.end(), std::ref(f32rng));
+  std::generate(w148.begin(), w148.end(), std::ref(f32rng));
+  std::generate(w149.begin(), w149.end(), std::ref(f32rng));
+  std::generate(w150.begin(), w150.end(), std::ref(f32rng));
+  std::generate(w151.begin(), w151.end(), std::ref(f32rng));
+  std::generate(w152.begin(), w152.end(), std::ref(f32rng));
+  std::generate(w153.begin(), w153.end(), std::ref(f32rng));
+  std::generate(w154.begin(), w154.end(), std::ref(f32rng));
+  std::generate(w155.begin(), w155.end(), std::ref(f32rng));
+  std::generate(w156.begin(), w156.end(), std::ref(f32rng));
+  std::generate(w157.begin(), w157.end(), std::ref(f32rng));
+  std::generate(w158.begin(), w158.end(), std::ref(f32rng));
+  std::generate(w159.begin(), w159.end(), std::ref(f32rng));
+  std::generate(w160.begin(), w160.end(), std::ref(f32rng));
+  std::generate(w161.begin(), w161.end(), std::ref(f32rng));
+  std::generate(w162.begin(), w162.end(), std::ref(f32rng));
+  std::generate(w163.begin(), w163.end(), std::ref(f32rng));
+  std::generate(w164.begin(), w164.end(), std::ref(f32rng));
+  std::generate(w165.begin(), w165.end(), std::ref(f32rng));
+  std::generate(w166.begin(), w166.end(), std::ref(f32rng));
+  std::generate(w167.begin(), w167.end(), std::ref(f32rng));
+  std::generate(w168.begin(), w168.end(), std::ref(f32rng));
+  std::generate(w169.begin(), w169.end(), std::ref(f32rng));
+  std::generate(w170.begin(), w170.end(), std::ref(f32rng));
+  std::generate(w171.begin(), w171.end(), std::ref(f32rng));
+  std::generate(w172.begin(), w172.end(), std::ref(f32rng));
+  std::generate(w173.begin(), w173.end(), std::ref(f32rng));
+  std::generate(w174.begin(), w174.end(), std::ref(f32rng));
+  std::generate(w175.begin(), w175.end(), std::ref(f32rng));
+  std::generate(w176.begin(), w176.end(), std::ref(f32rng));
+  std::generate(w177.begin(), w177.end(), std::ref(f32rng));
+  std::generate(w178.begin(), w178.end(), std::ref(f32rng));
+  std::generate(w179.begin(), w179.end(), std::ref(f32rng));
+  std::generate(w180.begin(), w180.end(), std::ref(f32rng));
+  std::generate(w181.begin(), w181.end(), std::ref(f32rng));
+  std::generate(w182.begin(), w182.end(), std::ref(f32rng));
+  std::generate(w183.begin(), w183.end(), std::ref(f32rng));
+  std::generate(w184.begin(), w184.end(), std::ref(f32rng));
+  std::generate(w185.begin(), w185.end(), std::ref(f32rng));
+  std::generate(w186.begin(), w186.end(), std::ref(f32rng));
+  std::generate(w187.begin(), w187.end(), std::ref(f32rng));
+  std::generate(w188.begin(), w188.end(), std::ref(f32rng));
+  std::generate(w189.begin(), w189.end(), std::ref(f32rng));
+  std::generate(w190.begin(), w190.end(), std::ref(f32rng));
+  std::generate(w191.begin(), w191.end(), std::ref(f32rng));
+  std::generate(w192.begin(), w192.end(), std::ref(f32rng));
+  std::generate(w193.begin(), w193.end(), std::ref(f32rng));
+  std::generate(w194.begin(), w194.end(), std::ref(f32rng));
+  std::generate(w195.begin(), w195.end(), std::ref(f32rng));
+  std::generate(w196.begin(), w196.end(), std::ref(f32rng));
+  std::generate(w197.begin(), w197.end(), std::ref(f32rng));
+  std::generate(w198.begin(), w198.end(), std::ref(f32rng));
+  std::generate(w199.begin(), w199.end(), std::ref(f32rng));
+  std::generate(w200.begin(), w200.end(), std::ref(f32rng));
+  std::generate(w201.begin(), w201.end(), std::ref(f32rng));
+  std::generate(w202.begin(), w202.end(), std::ref(f32rng));
+  std::generate(w203.begin(), w203.end(), std::ref(f32rng));
+  std::generate(w204.begin(), w204.end(), std::ref(f32rng));
+  std::generate(w205.begin(), w205.end(), std::ref(f32rng));
+  std::generate(w206.begin(), w206.end(), std::ref(f32rng));
+  std::generate(w207.begin(), w207.end(), std::ref(f32rng));
+  std::generate(w208.begin(), w208.end(), std::ref(f32rng));
+  std::generate(w209.begin(), w209.end(), std::ref(f32rng));
+  std::generate(w210.begin(), w210.end(), std::ref(f32rng));
+  std::generate(w211.begin(), w211.end(), std::ref(f32rng));
+  std::generate(w212.begin(), w212.end(), std::ref(f32rng));
+  std::generate(w213.begin(), w213.end(), std::ref(f32rng));
+  std::generate(w214.begin(), w214.end(), std::ref(f32rng));
+  std::generate(w215.begin(), w215.end(), std::ref(f32rng));
+  std::generate(w216.begin(), w216.end(), std::ref(f32rng));
+  std::generate(w217.begin(), w217.end(), std::ref(f32rng));
+  std::generate(w218.begin(), w218.end(), std::ref(f32rng));
+  std::generate(w219.begin(), w219.end(), std::ref(f32rng));
+  std::generate(w220.begin(), w220.end(), std::ref(f32rng));
+  std::generate(w221.begin(), w221.end(), std::ref(f32rng));
+  std::generate(w222.begin(), w222.end(), std::ref(f32rng));
+  std::generate(w223.begin(), w223.end(), std::ref(f32rng));
+  std::generate(w224.begin(), w224.end(), std::ref(f32rng));
+  std::generate(w225.begin(), w225.end(), std::ref(f32rng));
+  std::generate(w226.begin(), w226.end(), std::ref(f32rng));
+  std::generate(w227.begin(), w227.end(), std::ref(f32rng));
+  std::generate(w228.begin(), w228.end(), std::ref(f32rng));
+  std::generate(w229.begin(), w229.end(), std::ref(f32rng));
+  std::generate(w230.begin(), w230.end(), std::ref(f32rng));
+  std::generate(w231.begin(), w231.end(), std::ref(f32rng));
+  std::generate(w232.begin(), w232.end(), std::ref(f32rng));
+  std::generate(w233.begin(), w233.end(), std::ref(f32rng));
+  std::generate(w234.begin(), w234.end(), std::ref(f32rng));
+  std::generate(w235.begin(), w235.end(), std::ref(f32rng));
+  std::generate(w236.begin(), w236.end(), std::ref(f32rng));
+  std::generate(w237.begin(), w237.end(), std::ref(f32rng));
+  std::generate(w238.begin(), w238.end(), std::ref(f32rng));
+  std::generate(w239.begin(), w239.end(), std::ref(f32rng));
+  std::generate(w240.begin(), w240.end(), std::ref(f32rng));
+  std::generate(w241.begin(), w241.end(), std::ref(f32rng));
 
   ExecutionPlan operators;
   xnn_status status;
@@ -520,7 +521,7 @@
     16 /* output_channels_per_group */,
     3 /* input pixel stride */,
     16 /* output pixel stride */,
-    w114, w115,
+    w114.data(), w115.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op0);
@@ -555,7 +556,7 @@
     1 /* output_channels_per_group */,
     16 /* input pixel stride */,
     16 /* output pixel stride */,
-    w116, w117,
+    w116.data(), w117.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op2);
@@ -577,7 +578,7 @@
     16 /* output_channels_per_group */,
     16 /* input pixel stride */,
     16 /* output pixel stride */,
-    w118, w119,
+    w118.data(), w119.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op3);
@@ -610,7 +611,7 @@
     64 /* output_channels_per_group */,
     16 /* input pixel stride */,
     64 /* output pixel stride */,
-    w120, w121,
+    w120.data(), w121.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op5);
@@ -632,7 +633,7 @@
     1 /* output_channels_per_group */,
     64 /* input pixel stride */,
     64 /* output pixel stride */,
-    w122, w123,
+    w122.data(), w123.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op6);
@@ -654,7 +655,7 @@
     24 /* output_channels_per_group */,
     64 /* input pixel stride */,
     24 /* output pixel stride */,
-    w124, w125,
+    w124.data(), w125.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op7);
@@ -676,7 +677,7 @@
     72 /* output_channels_per_group */,
     24 /* input pixel stride */,
     72 /* output pixel stride */,
-    w126, w127,
+    w126.data(), w127.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op8);
@@ -698,7 +699,7 @@
     1 /* output_channels_per_group */,
     72 /* input pixel stride */,
     72 /* output pixel stride */,
-    w128, w129,
+    w128.data(), w129.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op9);
@@ -720,7 +721,7 @@
     24 /* output_channels_per_group */,
     72 /* input pixel stride */,
     24 /* output pixel stride */,
-    w130, w131,
+    w130.data(), w131.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op10);
@@ -753,7 +754,7 @@
     72 /* output_channels_per_group */,
     24 /* input pixel stride */,
     72 /* output pixel stride */,
-    w132, w133,
+    w132.data(), w133.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op12);
@@ -775,7 +776,7 @@
     1 /* output_channels_per_group */,
     72 /* input pixel stride */,
     72 /* output pixel stride */,
-    w134, w135,
+    w134.data(), w135.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op13);
@@ -809,7 +810,7 @@
     24 /* output_channels_per_group */,
     72 /* input pixel stride */,
     24 /* output pixel stride */,
-    w136, w137,
+    w136.data(), w137.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op15);
@@ -831,7 +832,7 @@
     72 /* output_channels_per_group */,
     24 /* input pixel stride */,
     72 /* output pixel stride */,
-    w138, w139,
+    w138.data(), w139.data(),
     0.0f /* output min */, +0x1.00014Fp+0 /* output max */,
     0 /* flags */,
     &op16);
@@ -864,7 +865,7 @@
     40 /* output_channels_per_group */,
     72 /* input pixel stride */,
     40 /* output pixel stride */,
-    w140, w141,
+    w140.data(), w141.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op18);
@@ -886,7 +887,7 @@
     120 /* output_channels_per_group */,
     40 /* input pixel stride */,
     120 /* output pixel stride */,
-    w142, w143,
+    w142.data(), w143.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op19);
@@ -908,7 +909,7 @@
     1 /* output_channels_per_group */,
     120 /* input pixel stride */,
     120 /* output pixel stride */,
-    w144, w145,
+    w144.data(), w145.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op20);
@@ -942,7 +943,7 @@
     32 /* output_channels_per_group */,
     120 /* input pixel stride */,
     32 /* output pixel stride */,
-    w146, w147,
+    w146.data(), w147.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op22);
@@ -964,7 +965,7 @@
     120 /* output_channels_per_group */,
     32 /* input pixel stride */,
     120 /* output pixel stride */,
-    w148, w149,
+    w148.data(), w149.data(),
     0.0f /* output min */, +0x1.00014Fp+0 /* output max */,
     0 /* flags */,
     &op23);
@@ -997,7 +998,7 @@
     40 /* output_channels_per_group */,
     120 /* input pixel stride */,
     40 /* output pixel stride */,
-    w150, w151,
+    w150.data(), w151.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op25);
@@ -1030,7 +1031,7 @@
     120 /* output_channels_per_group */,
     40 /* input pixel stride */,
     120 /* output pixel stride */,
-    w152, w153,
+    w152.data(), w153.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op27);
@@ -1052,7 +1053,7 @@
     1 /* output_channels_per_group */,
     120 /* input pixel stride */,
     120 /* output pixel stride */,
-    w154, w155,
+    w154.data(), w155.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op28);
@@ -1086,7 +1087,7 @@
     32 /* output_channels_per_group */,
     120 /* input pixel stride */,
     32 /* output pixel stride */,
-    w156, w157,
+    w156.data(), w157.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op30);
@@ -1108,7 +1109,7 @@
     120 /* output_channels_per_group */,
     32 /* input pixel stride */,
     120 /* output pixel stride */,
-    w158, w159,
+    w158.data(), w159.data(),
     0.0f /* output min */, +0x1.00014Fp+0 /* output max */,
     0 /* flags */,
     &op31);
@@ -1141,7 +1142,7 @@
     40 /* output_channels_per_group */,
     120 /* input pixel stride */,
     40 /* output pixel stride */,
-    w160, w161,
+    w160.data(), w161.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op33);
@@ -1174,7 +1175,7 @@
     240 /* output_channels_per_group */,
     40 /* input pixel stride */,
     240 /* output pixel stride */,
-    w162, w163,
+    w162.data(), w163.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op35);
@@ -1209,7 +1210,7 @@
     1 /* output_channels_per_group */,
     240 /* input pixel stride */,
     240 /* output pixel stride */,
-    w164, w165,
+    w164.data(), w165.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op37);
@@ -1244,7 +1245,7 @@
     80 /* output_channels_per_group */,
     240 /* input pixel stride */,
     80 /* output pixel stride */,
-    w166, w167,
+    w166.data(), w167.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op39);
@@ -1266,7 +1267,7 @@
     200 /* output_channels_per_group */,
     80 /* input pixel stride */,
     200 /* output pixel stride */,
-    w168, w169,
+    w168.data(), w169.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op40);
@@ -1301,7 +1302,7 @@
     1 /* output_channels_per_group */,
     200 /* input pixel stride */,
     200 /* output pixel stride */,
-    w170, w171,
+    w170.data(), w171.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op42);
@@ -1336,7 +1337,7 @@
     80 /* output_channels_per_group */,
     200 /* input pixel stride */,
     80 /* output pixel stride */,
-    w172, w173,
+    w172.data(), w173.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op44);
@@ -1369,7 +1370,7 @@
     184 /* output_channels_per_group */,
     80 /* input pixel stride */,
     184 /* output pixel stride */,
-    w174, w175,
+    w174.data(), w175.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op46);
@@ -1404,7 +1405,7 @@
     1 /* output_channels_per_group */,
     184 /* input pixel stride */,
     184 /* output pixel stride */,
-    w176, w177,
+    w176.data(), w177.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op48);
@@ -1439,7 +1440,7 @@
     80 /* output_channels_per_group */,
     184 /* input pixel stride */,
     80 /* output pixel stride */,
-    w178, w179,
+    w178.data(), w179.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op50);
@@ -1472,7 +1473,7 @@
     184 /* output_channels_per_group */,
     80 /* input pixel stride */,
     184 /* output pixel stride */,
-    w180, w181,
+    w180.data(), w181.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op52);
@@ -1507,7 +1508,7 @@
     1 /* output_channels_per_group */,
     184 /* input pixel stride */,
     184 /* output pixel stride */,
-    w182, w183,
+    w182.data(), w183.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op54);
@@ -1542,7 +1543,7 @@
     80 /* output_channels_per_group */,
     184 /* input pixel stride */,
     80 /* output pixel stride */,
-    w184, w185,
+    w184.data(), w185.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op56);
@@ -1575,7 +1576,7 @@
     480 /* output_channels_per_group */,
     80 /* input pixel stride */,
     480 /* output pixel stride */,
-    w186, w187,
+    w186.data(), w187.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op58);
@@ -1610,7 +1611,7 @@
     1 /* output_channels_per_group */,
     480 /* input pixel stride */,
     480 /* output pixel stride */,
-    w188, w189,
+    w188.data(), w189.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op60);
@@ -1657,7 +1658,7 @@
     120 /* output_channels_per_group */,
     480 /* input pixel stride */,
     120 /* output pixel stride */,
-    w190, w191,
+    w190.data(), w191.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op63);
@@ -1679,7 +1680,7 @@
     480 /* output_channels_per_group */,
     120 /* input pixel stride */,
     480 /* output pixel stride */,
-    w192, w193,
+    w192.data(), w193.data(),
     0.0f /* output min */, +0x1.00014Fp+0 /* output max */,
     0 /* flags */,
     &op64);
@@ -1712,7 +1713,7 @@
     112 /* output_channels_per_group */,
     480 /* input pixel stride */,
     112 /* output pixel stride */,
-    w194, w195,
+    w194.data(), w195.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op66);
@@ -1734,7 +1735,7 @@
     672 /* output_channels_per_group */,
     112 /* input pixel stride */,
     672 /* output pixel stride */,
-    w196, w197,
+    w196.data(), w197.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op67);
@@ -1769,7 +1770,7 @@
     1 /* output_channels_per_group */,
     672 /* input pixel stride */,
     672 /* output pixel stride */,
-    w198, w199,
+    w198.data(), w199.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op69);
@@ -1816,7 +1817,7 @@
     168 /* output_channels_per_group */,
     672 /* input pixel stride */,
     168 /* output pixel stride */,
-    w200, w201,
+    w200.data(), w201.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op72);
@@ -1838,7 +1839,7 @@
     672 /* output_channels_per_group */,
     168 /* input pixel stride */,
     672 /* output pixel stride */,
-    w202, w203,
+    w202.data(), w203.data(),
     0.0f /* output min */, +0x1.00014Fp+0 /* output max */,
     0 /* flags */,
     &op73);
@@ -1871,7 +1872,7 @@
     112 /* output_channels_per_group */,
     672 /* input pixel stride */,
     112 /* output pixel stride */,
-    w204, w205,
+    w204.data(), w205.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op75);
@@ -1904,7 +1905,7 @@
     672 /* output_channels_per_group */,
     112 /* input pixel stride */,
     672 /* output pixel stride */,
-    w206, w207,
+    w206.data(), w207.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op77);
@@ -1939,7 +1940,7 @@
     1 /* output_channels_per_group */,
     672 /* input pixel stride */,
     672 /* output pixel stride */,
-    w208, w209,
+    w208.data(), w209.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op79);
@@ -1986,7 +1987,7 @@
     168 /* output_channels_per_group */,
     672 /* input pixel stride */,
     168 /* output pixel stride */,
-    w210, w211,
+    w210.data(), w211.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op82);
@@ -2008,7 +2009,7 @@
     672 /* output_channels_per_group */,
     168 /* input pixel stride */,
     672 /* output pixel stride */,
-    w212, w213,
+    w212.data(), w213.data(),
     0.0f /* output min */, +0x1.00014Fp+0 /* output max */,
     0 /* flags */,
     &op83);
@@ -2041,7 +2042,7 @@
     160 /* output_channels_per_group */,
     672 /* input pixel stride */,
     160 /* output pixel stride */,
-    w214, w215,
+    w214.data(), w215.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op85);
@@ -2063,7 +2064,7 @@
     960 /* output_channels_per_group */,
     160 /* input pixel stride */,
     960 /* output pixel stride */,
-    w216, w217,
+    w216.data(), w217.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op86);
@@ -2098,7 +2099,7 @@
     1 /* output_channels_per_group */,
     960 /* input pixel stride */,
     960 /* output pixel stride */,
-    w218, w219,
+    w218.data(), w219.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op88);
@@ -2145,7 +2146,7 @@
     240 /* output_channels_per_group */,
     960 /* input pixel stride */,
     240 /* output pixel stride */,
-    w220, w221,
+    w220.data(), w221.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op91);
@@ -2167,7 +2168,7 @@
     960 /* output_channels_per_group */,
     240 /* input pixel stride */,
     960 /* output pixel stride */,
-    w222, w223,
+    w222.data(), w223.data(),
     0.0f /* output min */, +0x1.00014Fp+0 /* output max */,
     0 /* flags */,
     &op92);
@@ -2200,7 +2201,7 @@
     160 /* output_channels_per_group */,
     960 /* input pixel stride */,
     160 /* output pixel stride */,
-    w224, w225,
+    w224.data(), w225.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op94);
@@ -2233,7 +2234,7 @@
     960 /* output_channels_per_group */,
     160 /* input pixel stride */,
     960 /* output pixel stride */,
-    w226, w227,
+    w226.data(), w227.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op96);
@@ -2268,7 +2269,7 @@
     1 /* output_channels_per_group */,
     960 /* input pixel stride */,
     960 /* output pixel stride */,
-    w228, w229,
+    w228.data(), w229.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op98);
@@ -2315,7 +2316,7 @@
     240 /* output_channels_per_group */,
     960 /* input pixel stride */,
     240 /* output pixel stride */,
-    w230, w231,
+    w230.data(), w231.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op101);
@@ -2337,7 +2338,7 @@
     960 /* output_channels_per_group */,
     240 /* input pixel stride */,
     960 /* output pixel stride */,
-    w232, w233,
+    w232.data(), w233.data(),
     0.0f /* output min */, +0x1.00014Fp+0 /* output max */,
     0 /* flags */,
     &op102);
@@ -2370,7 +2371,7 @@
     160 /* output_channels_per_group */,
     960 /* input pixel stride */,
     160 /* output pixel stride */,
-    w234, w235,
+    w234.data(), w235.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op104);
@@ -2403,7 +2404,7 @@
     960 /* output_channels_per_group */,
     160 /* input pixel stride */,
     960 /* output pixel stride */,
-    w236, w237,
+    w236.data(), w237.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op106);
@@ -2450,7 +2451,7 @@
     1280 /* output_channels_per_group */,
     960 /* input pixel stride */,
     1280 /* output pixel stride */,
-    w238, w239,
+    w238.data(), w239.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op109);
@@ -2497,7 +2498,7 @@
     1001 /* output_channels_per_group */,
     1280 /* input pixel stride */,
     1001 /* output pixel stride */,
-    w240, w241,
+    w240.data(), w241.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op112);
@@ -2512,7 +2513,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op0,
     1 /* batch size */, 224 /* input height */, 224 /* input width */,
-    v0 /* input */, v1 /* output */,
+    v0.data() /* input */, v1.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #0" << std::endl;
@@ -2522,7 +2523,7 @@
   status = xnn_setup_hardswish_nc_f32(
     op1,
     12544 /* batch size */,
-    v1 /* input */, v2 /* output */,
+    v1.data() /* input */, v2.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #1" << std::endl;
@@ -2532,7 +2533,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op2,
     1 /* batch size */, 112 /* input height */, 112 /* input width */,
-    v2 /* input */, v3 /* output */,
+    v2.data() /* input */, v3.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #2" << std::endl;
@@ -2542,7 +2543,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op3,
     1 /* batch size */, 112 /* input height */, 112 /* input width */,
-    v3 /* input */, v4 /* output */,
+    v3.data() /* input */, v4.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #3" << std::endl;
@@ -2555,7 +2556,7 @@
     status = xnn_setup_add_nd_f32(
       op4,
       4, a_shape, 4, b_shape,
-      v4 /* a */, v2 /* b */, v5 /* output */,
+      v4.data() /* a */, v2.data() /* b */, v5.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -2566,7 +2567,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op5,
     1 /* batch size */, 112 /* input height */, 112 /* input width */,
-    v5 /* input */, v6 /* output */,
+    v5.data() /* input */, v6.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #5" << std::endl;
@@ -2576,7 +2577,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op6,
     1 /* batch size */, 112 /* input height */, 112 /* input width */,
-    v6 /* input */, v7 /* output */,
+    v6.data() /* input */, v7.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #6" << std::endl;
@@ -2586,7 +2587,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op7,
     1 /* batch size */, 56 /* input height */, 56 /* input width */,
-    v7 /* input */, v8 /* output */,
+    v7.data() /* input */, v8.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #7" << std::endl;
@@ -2596,7 +2597,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op8,
     1 /* batch size */, 56 /* input height */, 56 /* input width */,
-    v8 /* input */, v9 /* output */,
+    v8.data() /* input */, v9.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #8" << std::endl;
@@ -2606,7 +2607,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op9,
     1 /* batch size */, 56 /* input height */, 56 /* input width */,
-    v9 /* input */, v10 /* output */,
+    v9.data() /* input */, v10.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #9" << std::endl;
@@ -2616,7 +2617,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op10,
     1 /* batch size */, 56 /* input height */, 56 /* input width */,
-    v10 /* input */, v11 /* output */,
+    v10.data() /* input */, v11.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #10" << std::endl;
@@ -2629,7 +2630,7 @@
     status = xnn_setup_add_nd_f32(
       op11,
       4, a_shape, 4, b_shape,
-      v11 /* a */, v8 /* b */, v12 /* output */,
+      v11.data() /* a */, v8.data() /* b */, v12.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -2640,7 +2641,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op12,
     1 /* batch size */, 56 /* input height */, 56 /* input width */,
-    v12 /* input */, v13 /* output */,
+    v12.data() /* input */, v13.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #12" << std::endl;
@@ -2650,7 +2651,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op13,
     1 /* batch size */, 56 /* input height */, 56 /* input width */,
-    v13 /* input */, v14 /* output */,
+    v13.data() /* input */, v14.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #13" << std::endl;
@@ -2660,7 +2661,7 @@
   status = xnn_setup_global_average_pooling_nwc_f32(
     op14,
     1 /* batch size */, 784 /* width */,
-    v14 /* input */, v15 /* output */,
+    v14.data() /* input */, v15.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #14" << std::endl;
@@ -2670,7 +2671,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op15,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v15 /* input */, v16 /* output */,
+    v15.data() /* input */, v16.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #15" << std::endl;
@@ -2680,7 +2681,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op16,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v16 /* input */, v17 /* output */,
+    v16.data() /* input */, v17.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #16" << std::endl;
@@ -2693,7 +2694,7 @@
     status = xnn_setup_multiply_nd_f32(
       op17,
       4, a_shape, 4, b_shape,
-      v14 /* a */, v17 /* b */, v18 /* output */,
+      v14.data() /* a */, v17.data() /* b */, v18.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -2704,7 +2705,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op18,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v18 /* input */, v19 /* output */,
+    v18.data() /* input */, v19.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #18" << std::endl;
@@ -2714,7 +2715,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op19,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v19 /* input */, v20 /* output */,
+    v19.data() /* input */, v20.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #19" << std::endl;
@@ -2724,7 +2725,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op20,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v20 /* input */, v21 /* output */,
+    v20.data() /* input */, v21.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #20" << std::endl;
@@ -2734,7 +2735,7 @@
   status = xnn_setup_global_average_pooling_nwc_f32(
     op21,
     1 /* batch size */, 784 /* width */,
-    v21 /* input */, v22 /* output */,
+    v21.data() /* input */, v22.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #21" << std::endl;
@@ -2744,7 +2745,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op22,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v22 /* input */, v23 /* output */,
+    v22.data() /* input */, v23.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #22" << std::endl;
@@ -2754,7 +2755,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op23,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v23 /* input */, v24 /* output */,
+    v23.data() /* input */, v24.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #23" << std::endl;
@@ -2767,7 +2768,7 @@
     status = xnn_setup_multiply_nd_f32(
       op24,
       4, a_shape, 4, b_shape,
-      v21 /* a */, v24 /* b */, v25 /* output */,
+      v21.data() /* a */, v24.data() /* b */, v25.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -2778,7 +2779,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op25,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v25 /* input */, v26 /* output */,
+    v25.data() /* input */, v26.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #25" << std::endl;
@@ -2791,7 +2792,7 @@
     status = xnn_setup_add_nd_f32(
       op26,
       4, a_shape, 4, b_shape,
-      v26 /* a */, v19 /* b */, v27 /* output */,
+      v26.data() /* a */, v19.data() /* b */, v27.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -2802,7 +2803,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op27,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v27 /* input */, v28 /* output */,
+    v27.data() /* input */, v28.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #27" << std::endl;
@@ -2812,7 +2813,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op28,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v28 /* input */, v29 /* output */,
+    v28.data() /* input */, v29.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #28" << std::endl;
@@ -2822,7 +2823,7 @@
   status = xnn_setup_global_average_pooling_nwc_f32(
     op29,
     1 /* batch size */, 784 /* width */,
-    v29 /* input */, v30 /* output */,
+    v29.data() /* input */, v30.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #29" << std::endl;
@@ -2832,7 +2833,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op30,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v30 /* input */, v31 /* output */,
+    v30.data() /* input */, v31.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #30" << std::endl;
@@ -2842,7 +2843,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op31,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v31 /* input */, v32 /* output */,
+    v31.data() /* input */, v32.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #31" << std::endl;
@@ -2855,7 +2856,7 @@
     status = xnn_setup_multiply_nd_f32(
       op32,
       4, a_shape, 4, b_shape,
-      v29 /* a */, v32 /* b */, v33 /* output */,
+      v29.data() /* a */, v32.data() /* b */, v33.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -2866,7 +2867,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op33,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v33 /* input */, v34 /* output */,
+    v33.data() /* input */, v34.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #33" << std::endl;
@@ -2879,7 +2880,7 @@
     status = xnn_setup_add_nd_f32(
       op34,
       4, a_shape, 4, b_shape,
-      v34 /* a */, v27 /* b */, v35 /* output */,
+      v34.data() /* a */, v27.data() /* b */, v35.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -2890,7 +2891,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op35,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v35 /* input */, v36 /* output */,
+    v35.data() /* input */, v36.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #35" << std::endl;
@@ -2900,7 +2901,7 @@
   status = xnn_setup_hardswish_nc_f32(
     op36,
     784 /* batch size */,
-    v36 /* input */, v37 /* output */,
+    v36.data() /* input */, v37.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #36" << std::endl;
@@ -2910,7 +2911,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op37,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v37 /* input */, v38 /* output */,
+    v37.data() /* input */, v38.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #37" << std::endl;
@@ -2920,7 +2921,7 @@
   status = xnn_setup_hardswish_nc_f32(
     op38,
     196 /* batch size */,
-    v38 /* input */, v39 /* output */,
+    v38.data() /* input */, v39.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #38" << std::endl;
@@ -2930,7 +2931,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op39,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v39 /* input */, v40 /* output */,
+    v39.data() /* input */, v40.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #39" << std::endl;
@@ -2940,7 +2941,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op40,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v40 /* input */, v41 /* output */,
+    v40.data() /* input */, v41.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #40" << std::endl;
@@ -2950,7 +2951,7 @@
   status = xnn_setup_hardswish_nc_f32(
     op41,
     196 /* batch size */,
-    v41 /* input */, v42 /* output */,
+    v41.data() /* input */, v42.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #41" << std::endl;
@@ -2960,7 +2961,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op42,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v42 /* input */, v43 /* output */,
+    v42.data() /* input */, v43.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #42" << std::endl;
@@ -2970,7 +2971,7 @@
   status = xnn_setup_hardswish_nc_f32(
     op43,
     196 /* batch size */,
-    v43 /* input */, v44 /* output */,
+    v43.data() /* input */, v44.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #43" << std::endl;
@@ -2980,7 +2981,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op44,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v44 /* input */, v45 /* output */,
+    v44.data() /* input */, v45.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #44" << std::endl;
@@ -2993,7 +2994,7 @@
     status = xnn_setup_add_nd_f32(
       op45,
       4, a_shape, 4, b_shape,
-      v45 /* a */, v40 /* b */, v46 /* output */,
+      v45.data() /* a */, v40.data() /* b */, v46.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -3004,7 +3005,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op46,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v46 /* input */, v47 /* output */,
+    v46.data() /* input */, v47.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #46" << std::endl;
@@ -3014,7 +3015,7 @@
   status = xnn_setup_hardswish_nc_f32(
     op47,
     196 /* batch size */,
-    v47 /* input */, v48 /* output */,
+    v47.data() /* input */, v48.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #47" << std::endl;
@@ -3024,7 +3025,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op48,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v48 /* input */, v49 /* output */,
+    v48.data() /* input */, v49.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #48" << std::endl;
@@ -3034,7 +3035,7 @@
   status = xnn_setup_hardswish_nc_f32(
     op49,
     196 /* batch size */,
-    v49 /* input */, v50 /* output */,
+    v49.data() /* input */, v50.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #49" << std::endl;
@@ -3044,7 +3045,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op50,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v50 /* input */, v51 /* output */,
+    v50.data() /* input */, v51.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #50" << std::endl;
@@ -3057,7 +3058,7 @@
     status = xnn_setup_add_nd_f32(
       op51,
       4, a_shape, 4, b_shape,
-      v51 /* a */, v46 /* b */, v52 /* output */,
+      v51.data() /* a */, v46.data() /* b */, v52.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -3068,7 +3069,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op52,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v52 /* input */, v53 /* output */,
+    v52.data() /* input */, v53.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #52" << std::endl;
@@ -3078,7 +3079,7 @@
   status = xnn_setup_hardswish_nc_f32(
     op53,
     196 /* batch size */,
-    v53 /* input */, v54 /* output */,
+    v53.data() /* input */, v54.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #53" << std::endl;
@@ -3088,7 +3089,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op54,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v54 /* input */, v55 /* output */,
+    v54.data() /* input */, v55.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #54" << std::endl;
@@ -3098,7 +3099,7 @@
   status = xnn_setup_hardswish_nc_f32(
     op55,
     196 /* batch size */,
-    v55 /* input */, v56 /* output */,
+    v55.data() /* input */, v56.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #55" << std::endl;
@@ -3108,7 +3109,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op56,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v56 /* input */, v57 /* output */,
+    v56.data() /* input */, v57.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #56" << std::endl;
@@ -3121,7 +3122,7 @@
     status = xnn_setup_add_nd_f32(
       op57,
       4, a_shape, 4, b_shape,
-      v57 /* a */, v52 /* b */, v58 /* output */,
+      v57.data() /* a */, v52.data() /* b */, v58.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -3132,7 +3133,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op58,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v58 /* input */, v59 /* output */,
+    v58.data() /* input */, v59.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #58" << std::endl;
@@ -3142,7 +3143,7 @@
   status = xnn_setup_hardswish_nc_f32(
     op59,
     196 /* batch size */,
-    v59 /* input */, v60 /* output */,
+    v59.data() /* input */, v60.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #59" << std::endl;
@@ -3152,7 +3153,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op60,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v60 /* input */, v61 /* output */,
+    v60.data() /* input */, v61.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #60" << std::endl;
@@ -3162,7 +3163,7 @@
   status = xnn_setup_hardswish_nc_f32(
     op61,
     196 /* batch size */,
-    v61 /* input */, v62 /* output */,
+    v61.data() /* input */, v62.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #61" << std::endl;
@@ -3172,7 +3173,7 @@
   status = xnn_setup_global_average_pooling_nwc_f32(
     op62,
     1 /* batch size */, 196 /* width */,
-    v62 /* input */, v63 /* output */,
+    v62.data() /* input */, v63.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #62" << std::endl;
@@ -3182,7 +3183,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op63,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v63 /* input */, v64 /* output */,
+    v63.data() /* input */, v64.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #63" << std::endl;
@@ -3192,7 +3193,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op64,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v64 /* input */, v65 /* output */,
+    v64.data() /* input */, v65.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #64" << std::endl;
@@ -3205,7 +3206,7 @@
     status = xnn_setup_multiply_nd_f32(
       op65,
       4, a_shape, 4, b_shape,
-      v62 /* a */, v65 /* b */, v66 /* output */,
+      v62.data() /* a */, v65.data() /* b */, v66.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -3216,7 +3217,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op66,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v66 /* input */, v67 /* output */,
+    v66.data() /* input */, v67.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #66" << std::endl;
@@ -3226,7 +3227,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op67,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v67 /* input */, v68 /* output */,
+    v67.data() /* input */, v68.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #67" << std::endl;
@@ -3236,7 +3237,7 @@
   status = xnn_setup_hardswish_nc_f32(
     op68,
     196 /* batch size */,
-    v68 /* input */, v69 /* output */,
+    v68.data() /* input */, v69.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #68" << std::endl;
@@ -3246,7 +3247,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op69,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v69 /* input */, v70 /* output */,
+    v69.data() /* input */, v70.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #69" << std::endl;
@@ -3256,7 +3257,7 @@
   status = xnn_setup_hardswish_nc_f32(
     op70,
     196 /* batch size */,
-    v70 /* input */, v71 /* output */,
+    v70.data() /* input */, v71.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #70" << std::endl;
@@ -3266,7 +3267,7 @@
   status = xnn_setup_global_average_pooling_nwc_f32(
     op71,
     1 /* batch size */, 196 /* width */,
-    v71 /* input */, v72 /* output */,
+    v71.data() /* input */, v72.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #71" << std::endl;
@@ -3276,7 +3277,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op72,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v72 /* input */, v73 /* output */,
+    v72.data() /* input */, v73.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #72" << std::endl;
@@ -3286,7 +3287,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op73,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v73 /* input */, v74 /* output */,
+    v73.data() /* input */, v74.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #73" << std::endl;
@@ -3299,7 +3300,7 @@
     status = xnn_setup_multiply_nd_f32(
       op74,
       4, a_shape, 4, b_shape,
-      v71 /* a */, v74 /* b */, v75 /* output */,
+      v71.data() /* a */, v74.data() /* b */, v75.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -3310,7 +3311,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op75,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v75 /* input */, v76 /* output */,
+    v75.data() /* input */, v76.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #75" << std::endl;
@@ -3323,7 +3324,7 @@
     status = xnn_setup_add_nd_f32(
       op76,
       4, a_shape, 4, b_shape,
-      v76 /* a */, v67 /* b */, v77 /* output */,
+      v76.data() /* a */, v67.data() /* b */, v77.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -3334,7 +3335,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op77,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v77 /* input */, v78 /* output */,
+    v77.data() /* input */, v78.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #77" << std::endl;
@@ -3344,7 +3345,7 @@
   status = xnn_setup_hardswish_nc_f32(
     op78,
     196 /* batch size */,
-    v78 /* input */, v79 /* output */,
+    v78.data() /* input */, v79.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #78" << std::endl;
@@ -3354,7 +3355,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op79,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v79 /* input */, v80 /* output */,
+    v79.data() /* input */, v80.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #79" << std::endl;
@@ -3364,7 +3365,7 @@
   status = xnn_setup_hardswish_nc_f32(
     op80,
     49 /* batch size */,
-    v80 /* input */, v81 /* output */,
+    v80.data() /* input */, v81.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #80" << std::endl;
@@ -3374,7 +3375,7 @@
   status = xnn_setup_global_average_pooling_nwc_f32(
     op81,
     1 /* batch size */, 49 /* width */,
-    v81 /* input */, v82 /* output */,
+    v81.data() /* input */, v82.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #81" << std::endl;
@@ -3384,7 +3385,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op82,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v82 /* input */, v83 /* output */,
+    v82.data() /* input */, v83.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #82" << std::endl;
@@ -3394,7 +3395,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op83,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v83 /* input */, v84 /* output */,
+    v83.data() /* input */, v84.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #83" << std::endl;
@@ -3407,7 +3408,7 @@
     status = xnn_setup_multiply_nd_f32(
       op84,
       4, a_shape, 4, b_shape,
-      v81 /* a */, v84 /* b */, v85 /* output */,
+      v81.data() /* a */, v84.data() /* b */, v85.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -3418,7 +3419,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op85,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v85 /* input */, v86 /* output */,
+    v85.data() /* input */, v86.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #85" << std::endl;
@@ -3428,7 +3429,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op86,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v86 /* input */, v87 /* output */,
+    v86.data() /* input */, v87.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #86" << std::endl;
@@ -3438,7 +3439,7 @@
   status = xnn_setup_hardswish_nc_f32(
     op87,
     49 /* batch size */,
-    v87 /* input */, v88 /* output */,
+    v87.data() /* input */, v88.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #87" << std::endl;
@@ -3448,7 +3449,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op88,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v88 /* input */, v89 /* output */,
+    v88.data() /* input */, v89.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #88" << std::endl;
@@ -3458,7 +3459,7 @@
   status = xnn_setup_hardswish_nc_f32(
     op89,
     49 /* batch size */,
-    v89 /* input */, v90 /* output */,
+    v89.data() /* input */, v90.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #89" << std::endl;
@@ -3468,7 +3469,7 @@
   status = xnn_setup_global_average_pooling_nwc_f32(
     op90,
     1 /* batch size */, 49 /* width */,
-    v90 /* input */, v91 /* output */,
+    v90.data() /* input */, v91.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #90" << std::endl;
@@ -3478,7 +3479,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op91,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v91 /* input */, v92 /* output */,
+    v91.data() /* input */, v92.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #91" << std::endl;
@@ -3488,7 +3489,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op92,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v92 /* input */, v93 /* output */,
+    v92.data() /* input */, v93.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #92" << std::endl;
@@ -3501,7 +3502,7 @@
     status = xnn_setup_multiply_nd_f32(
       op93,
       4, a_shape, 4, b_shape,
-      v90 /* a */, v93 /* b */, v94 /* output */,
+      v90.data() /* a */, v93.data() /* b */, v94.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -3512,7 +3513,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op94,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v94 /* input */, v95 /* output */,
+    v94.data() /* input */, v95.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #94" << std::endl;
@@ -3525,7 +3526,7 @@
     status = xnn_setup_add_nd_f32(
       op95,
       4, a_shape, 4, b_shape,
-      v95 /* a */, v86 /* b */, v96 /* output */,
+      v95.data() /* a */, v86.data() /* b */, v96.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -3536,7 +3537,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op96,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v96 /* input */, v97 /* output */,
+    v96.data() /* input */, v97.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #96" << std::endl;
@@ -3546,7 +3547,7 @@
   status = xnn_setup_hardswish_nc_f32(
     op97,
     49 /* batch size */,
-    v97 /* input */, v98 /* output */,
+    v97.data() /* input */, v98.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #97" << std::endl;
@@ -3556,7 +3557,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op98,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v98 /* input */, v99 /* output */,
+    v98.data() /* input */, v99.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #98" << std::endl;
@@ -3566,7 +3567,7 @@
   status = xnn_setup_hardswish_nc_f32(
     op99,
     49 /* batch size */,
-    v99 /* input */, v100 /* output */,
+    v99.data() /* input */, v100.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #99" << std::endl;
@@ -3576,7 +3577,7 @@
   status = xnn_setup_global_average_pooling_nwc_f32(
     op100,
     1 /* batch size */, 49 /* width */,
-    v100 /* input */, v101 /* output */,
+    v100.data() /* input */, v101.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #100" << std::endl;
@@ -3586,7 +3587,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op101,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v101 /* input */, v102 /* output */,
+    v101.data() /* input */, v102.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #101" << std::endl;
@@ -3596,7 +3597,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op102,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v102 /* input */, v103 /* output */,
+    v102.data() /* input */, v103.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #102" << std::endl;
@@ -3609,7 +3610,7 @@
     status = xnn_setup_multiply_nd_f32(
       op103,
       4, a_shape, 4, b_shape,
-      v100 /* a */, v103 /* b */, v104 /* output */,
+      v100.data() /* a */, v103.data() /* b */, v104.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -3620,7 +3621,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op104,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v104 /* input */, v105 /* output */,
+    v104.data() /* input */, v105.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #104" << std::endl;
@@ -3633,7 +3634,7 @@
     status = xnn_setup_add_nd_f32(
       op105,
       4, a_shape, 4, b_shape,
-      v105 /* a */, v96 /* b */, v106 /* output */,
+      v105.data() /* a */, v96.data() /* b */, v106.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -3644,7 +3645,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op106,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v106 /* input */, v107 /* output */,
+    v106.data() /* input */, v107.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #106" << std::endl;
@@ -3654,7 +3655,7 @@
   status = xnn_setup_hardswish_nc_f32(
     op107,
     49 /* batch size */,
-    v107 /* input */, v108 /* output */,
+    v107.data() /* input */, v108.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #107" << std::endl;
@@ -3664,7 +3665,7 @@
   status = xnn_setup_global_average_pooling_nwc_f32(
     op108,
     1 /* batch size */, 49 /* width */,
-    v108 /* input */, v109 /* output */,
+    v108.data() /* input */, v109.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #108" << std::endl;
@@ -3674,7 +3675,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op109,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v109 /* input */, v110 /* output */,
+    v109.data() /* input */, v110.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #109" << std::endl;
@@ -3684,7 +3685,7 @@
   status = xnn_setup_hardswish_nc_f32(
     op110,
     1 /* batch size */,
-    v110 /* input */, v111 /* output */,
+    v110.data() /* input */, v111.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #110" << std::endl;
@@ -3694,7 +3695,7 @@
   status = xnn_setup_global_average_pooling_nwc_f32(
     op111,
     1 /* batch size */, 1 /* width */,
-    v111 /* input */, v112 /* output */,
+    v111.data() /* input */, v112.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #111" << std::endl;
@@ -3704,7 +3705,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op112,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v112 /* input */, v113 /* output */,
+    v112.data() /* input */, v113.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #112" << std::endl;
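(Note on the pattern in these model files: the buffers previously declared as raw static C arrays become alignas(16) static std::array<float, N>, and every weight/activation pointer handed to the xnn_create_*/xnn_setup_* calls becomes buffer.data(). A minimal standalone sketch of just that change is below; the process() helper is a hypothetical stand-in for an XNNPACK setup call, not part of the library API, and the two sizes are taken from the v0/v1 declarations in this patch.)

  #include <array>
  #include <cstddef>

  // Stand-in for an XNNPACK setup call: it only needs raw float pointers,
  // which std::array exposes via .data().
  static void process(const float* input, float* output, std::size_t size) {
    for (std::size_t i = 0; i < size; i++) {
      output[i] = input[i];  // placeholder for the real kernel
    }
  }

  int main() {
    // Before this patch: "alignas(16) static float v0[150528];" decayed to
    // float* implicitly.  With std::array the pointer is taken explicitly.
    alignas(16) static std::array<float, 150528> v0;
    alignas(16) static std::array<float, 200704> v1;
    process(v0.data(), v1.data(), v0.size());
    return 0;
  }
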
diff --git a/models/fp32-mobilenet-v3-small.cc b/models/fp32-mobilenet-v3-small.cc
index d7bb0a4..4af3e48 100644
--- a/models/fp32-mobilenet-v3-small.cc
+++ b/models/fp32-mobilenet-v3-small.cc
@@ -5,6 +5,7 @@
 
 #include <xnnpack.h>
 
+#include <array>
 #include <algorithm>
 #include <functional>
 #include <iostream>
@@ -16,426 +17,426 @@
 namespace models {
 
 ExecutionPlan FP32MobileNetV3Small(pthreadpool_t threadpool) {
-  alignas(16) static float v0[150528];
-  alignas(16) static float v1[200704];
-  alignas(16) static float v2[200704];
-  alignas(16) static float v3[50176];
-  alignas(16) static float v4[16];
-  alignas(16) static float v5[8];
-  alignas(16) static float v6[16];
-  alignas(16) static float v7[50176];
-  alignas(16) static float v8[50176];
-  alignas(16) static float v9[225792];
-  alignas(16) static float v10[56448];
-  alignas(16) static float v11[18816];
-  alignas(16) static float v12[68992];
-  alignas(16) static float v13[68992];
-  alignas(16) static float v14[18816];
-  alignas(16) static float v15[18816];
-  alignas(16) static float v16[75264];
-  alignas(16) static float v17[75264];
-  alignas(16) static float v18[18816];
-  alignas(16) static float v19[18816];
-  alignas(16) static float v20[96];
-  alignas(16) static float v21[24];
-  alignas(16) static float v22[96];
-  alignas(16) static float v23[18816];
-  alignas(16) static float v24[7840];
-  alignas(16) static float v25[47040];
-  alignas(16) static float v26[47040];
-  alignas(16) static float v27[47040];
-  alignas(16) static float v28[47040];
-  alignas(16) static float v29[240];
-  alignas(16) static float v30[64];
-  alignas(16) static float v31[240];
-  alignas(16) static float v32[47040];
-  alignas(16) static float v33[7840];
-  alignas(16) static float v34[7840];
-  alignas(16) static float v35[47040];
-  alignas(16) static float v36[47040];
-  alignas(16) static float v37[47040];
-  alignas(16) static float v38[47040];
-  alignas(16) static float v39[240];
-  alignas(16) static float v40[64];
-  alignas(16) static float v41[240];
-  alignas(16) static float v42[47040];
-  alignas(16) static float v43[7840];
-  alignas(16) static float v44[7840];
-  alignas(16) static float v45[23520];
-  alignas(16) static float v46[23520];
-  alignas(16) static float v47[23520];
-  alignas(16) static float v48[23520];
-  alignas(16) static float v49[120];
-  alignas(16) static float v50[32];
-  alignas(16) static float v51[120];
-  alignas(16) static float v52[23520];
-  alignas(16) static float v53[9408];
-  alignas(16) static float v54[28224];
-  alignas(16) static float v55[28224];
-  alignas(16) static float v56[28224];
-  alignas(16) static float v57[28224];
-  alignas(16) static float v58[144];
-  alignas(16) static float v59[40];
-  alignas(16) static float v60[144];
-  alignas(16) static float v61[28224];
-  alignas(16) static float v62[9408];
-  alignas(16) static float v63[9408];
-  alignas(16) static float v64[56448];
-  alignas(16) static float v65[56448];
-  alignas(16) static float v66[14112];
-  alignas(16) static float v67[14112];
-  alignas(16) static float v68[288];
-  alignas(16) static float v69[72];
-  alignas(16) static float v70[288];
-  alignas(16) static float v71[14112];
-  alignas(16) static float v72[4704];
-  alignas(16) static float v73[28224];
-  alignas(16) static float v74[28224];
-  alignas(16) static float v75[28224];
-  alignas(16) static float v76[28224];
-  alignas(16) static float v77[576];
-  alignas(16) static float v78[144];
-  alignas(16) static float v79[576];
-  alignas(16) static float v80[28224];
-  alignas(16) static float v81[4704];
-  alignas(16) static float v82[4704];
-  alignas(16) static float v83[28224];
-  alignas(16) static float v84[28224];
-  alignas(16) static float v85[28224];
-  alignas(16) static float v86[28224];
-  alignas(16) static float v87[576];
-  alignas(16) static float v88[144];
-  alignas(16) static float v89[576];
-  alignas(16) static float v90[28224];
-  alignas(16) static float v91[4704];
-  alignas(16) static float v92[4704];
-  alignas(16) static float v93[28224];
-  alignas(16) static float v94[28224];
-  alignas(16) static float v95[576];
-  alignas(16) static float v96[1024];
-  alignas(16) static float v97[1024];
-  alignas(16) static float v98[1024];
-  alignas(16) static float v99[1001];
-  alignas(16) static float w100[432];
-  alignas(16) static float w101[16];
-  alignas(16) static float w102[144];
-  alignas(16) static float w103[16];
-  alignas(16) static float w104[128];
-  alignas(16) static float w105[8];
-  alignas(16) static float w106[128];
-  alignas(16) static float w107[16];
-  alignas(16) static float w108[256];
-  alignas(16) static float w109[16];
-  alignas(16) static float w110[1152];
-  alignas(16) static float w111[72];
-  alignas(16) static float w112[648];
-  alignas(16) static float w113[72];
-  alignas(16) static float w114[1728];
-  alignas(16) static float w115[24];
-  alignas(16) static float w116[2112];
-  alignas(16) static float w117[88];
-  alignas(16) static float w118[792];
-  alignas(16) static float w119[88];
-  alignas(16) static float w120[2112];
-  alignas(16) static float w121[24];
-  alignas(16) static float w122[2304];
-  alignas(16) static float w123[96];
-  alignas(16) static float w124[2400];
-  alignas(16) static float w125[96];
-  alignas(16) static float w126[2304];
-  alignas(16) static float w127[24];
-  alignas(16) static float w128[2304];
-  alignas(16) static float w129[96];
-  alignas(16) static float w130[3840];
-  alignas(16) static float w131[40];
-  alignas(16) static float w132[9600];
-  alignas(16) static float w133[240];
-  alignas(16) static float w134[6000];
-  alignas(16) static float w135[240];
-  alignas(16) static float w136[15360];
-  alignas(16) static float w137[64];
-  alignas(16) static float w138[15360];
-  alignas(16) static float w139[240];
-  alignas(16) static float w140[9600];
-  alignas(16) static float w141[40];
-  alignas(16) static float w142[9600];
-  alignas(16) static float w143[240];
-  alignas(16) static float w144[6000];
-  alignas(16) static float w145[240];
-  alignas(16) static float w146[15360];
-  alignas(16) static float w147[64];
-  alignas(16) static float w148[15360];
-  alignas(16) static float w149[240];
-  alignas(16) static float w150[9600];
-  alignas(16) static float w151[40];
-  alignas(16) static float w152[4800];
-  alignas(16) static float w153[120];
-  alignas(16) static float w154[3000];
-  alignas(16) static float w155[120];
-  alignas(16) static float w156[3840];
-  alignas(16) static float w157[32];
-  alignas(16) static float w158[3840];
-  alignas(16) static float w159[120];
-  alignas(16) static float w160[5760];
-  alignas(16) static float w161[48];
-  alignas(16) static float w162[6912];
-  alignas(16) static float w163[144];
-  alignas(16) static float w164[3600];
-  alignas(16) static float w165[144];
-  alignas(16) static float w166[5760];
-  alignas(16) static float w167[40];
-  alignas(16) static float w168[5760];
-  alignas(16) static float w169[144];
-  alignas(16) static float w170[6912];
-  alignas(16) static float w171[48];
-  alignas(16) static float w172[13824];
-  alignas(16) static float w173[288];
-  alignas(16) static float w174[7200];
-  alignas(16) static float w175[288];
-  alignas(16) static float w176[20736];
-  alignas(16) static float w177[72];
-  alignas(16) static float w178[20736];
-  alignas(16) static float w179[288];
-  alignas(16) static float w180[27648];
-  alignas(16) static float w181[96];
-  alignas(16) static float w182[55296];
-  alignas(16) static float w183[576];
-  alignas(16) static float w184[14400];
-  alignas(16) static float w185[576];
-  alignas(16) static float w186[82944];
-  alignas(16) static float w187[144];
-  alignas(16) static float w188[82944];
-  alignas(16) static float w189[576];
-  alignas(16) static float w190[55296];
-  alignas(16) static float w191[96];
-  alignas(16) static float w192[55296];
-  alignas(16) static float w193[576];
-  alignas(16) static float w194[14400];
-  alignas(16) static float w195[576];
-  alignas(16) static float w196[82944];
-  alignas(16) static float w197[144];
-  alignas(16) static float w198[82944];
-  alignas(16) static float w199[576];
-  alignas(16) static float w200[55296];
-  alignas(16) static float w201[96];
-  alignas(16) static float w202[55296];
-  alignas(16) static float w203[576];
-  alignas(16) static float w204[589824];
-  alignas(16) static float w205[1024];
-  alignas(16) static float w206[1025024];
-  alignas(16) static float w207[1001];
+  alignas(16) static std::array<float, 150528> v0;
+  alignas(16) static std::array<float, 200704> v1;
+  alignas(16) static std::array<float, 200704> v2;
+  alignas(16) static std::array<float, 50176> v3;
+  alignas(16) static std::array<float, 16> v4;
+  alignas(16) static std::array<float, 8> v5;
+  alignas(16) static std::array<float, 16> v6;
+  alignas(16) static std::array<float, 50176> v7;
+  alignas(16) static std::array<float, 50176> v8;
+  alignas(16) static std::array<float, 225792> v9;
+  alignas(16) static std::array<float, 56448> v10;
+  alignas(16) static std::array<float, 18816> v11;
+  alignas(16) static std::array<float, 68992> v12;
+  alignas(16) static std::array<float, 68992> v13;
+  alignas(16) static std::array<float, 18816> v14;
+  alignas(16) static std::array<float, 18816> v15;
+  alignas(16) static std::array<float, 75264> v16;
+  alignas(16) static std::array<float, 75264> v17;
+  alignas(16) static std::array<float, 18816> v18;
+  alignas(16) static std::array<float, 18816> v19;
+  alignas(16) static std::array<float, 96> v20;
+  alignas(16) static std::array<float, 24> v21;
+  alignas(16) static std::array<float, 96> v22;
+  alignas(16) static std::array<float, 18816> v23;
+  alignas(16) static std::array<float, 7840> v24;
+  alignas(16) static std::array<float, 47040> v25;
+  alignas(16) static std::array<float, 47040> v26;
+  alignas(16) static std::array<float, 47040> v27;
+  alignas(16) static std::array<float, 47040> v28;
+  alignas(16) static std::array<float, 240> v29;
+  alignas(16) static std::array<float, 64> v30;
+  alignas(16) static std::array<float, 240> v31;
+  alignas(16) static std::array<float, 47040> v32;
+  alignas(16) static std::array<float, 7840> v33;
+  alignas(16) static std::array<float, 7840> v34;
+  alignas(16) static std::array<float, 47040> v35;
+  alignas(16) static std::array<float, 47040> v36;
+  alignas(16) static std::array<float, 47040> v37;
+  alignas(16) static std::array<float, 47040> v38;
+  alignas(16) static std::array<float, 240> v39;
+  alignas(16) static std::array<float, 64> v40;
+  alignas(16) static std::array<float, 240> v41;
+  alignas(16) static std::array<float, 47040> v42;
+  alignas(16) static std::array<float, 7840> v43;
+  alignas(16) static std::array<float, 7840> v44;
+  alignas(16) static std::array<float, 23520> v45;
+  alignas(16) static std::array<float, 23520> v46;
+  alignas(16) static std::array<float, 23520> v47;
+  alignas(16) static std::array<float, 23520> v48;
+  alignas(16) static std::array<float, 120> v49;
+  alignas(16) static std::array<float, 32> v50;
+  alignas(16) static std::array<float, 120> v51;
+  alignas(16) static std::array<float, 23520> v52;
+  alignas(16) static std::array<float, 9408> v53;
+  alignas(16) static std::array<float, 28224> v54;
+  alignas(16) static std::array<float, 28224> v55;
+  alignas(16) static std::array<float, 28224> v56;
+  alignas(16) static std::array<float, 28224> v57;
+  alignas(16) static std::array<float, 144> v58;
+  alignas(16) static std::array<float, 40> v59;
+  alignas(16) static std::array<float, 144> v60;
+  alignas(16) static std::array<float, 28224> v61;
+  alignas(16) static std::array<float, 9408> v62;
+  alignas(16) static std::array<float, 9408> v63;
+  alignas(16) static std::array<float, 56448> v64;
+  alignas(16) static std::array<float, 56448> v65;
+  alignas(16) static std::array<float, 14112> v66;
+  alignas(16) static std::array<float, 14112> v67;
+  alignas(16) static std::array<float, 288> v68;
+  alignas(16) static std::array<float, 72> v69;
+  alignas(16) static std::array<float, 288> v70;
+  alignas(16) static std::array<float, 14112> v71;
+  alignas(16) static std::array<float, 4704> v72;
+  alignas(16) static std::array<float, 28224> v73;
+  alignas(16) static std::array<float, 28224> v74;
+  alignas(16) static std::array<float, 28224> v75;
+  alignas(16) static std::array<float, 28224> v76;
+  alignas(16) static std::array<float, 576> v77;
+  alignas(16) static std::array<float, 144> v78;
+  alignas(16) static std::array<float, 576> v79;
+  alignas(16) static std::array<float, 28224> v80;
+  alignas(16) static std::array<float, 4704> v81;
+  alignas(16) static std::array<float, 4704> v82;
+  alignas(16) static std::array<float, 28224> v83;
+  alignas(16) static std::array<float, 28224> v84;
+  alignas(16) static std::array<float, 28224> v85;
+  alignas(16) static std::array<float, 28224> v86;
+  alignas(16) static std::array<float, 576> v87;
+  alignas(16) static std::array<float, 144> v88;
+  alignas(16) static std::array<float, 576> v89;
+  alignas(16) static std::array<float, 28224> v90;
+  alignas(16) static std::array<float, 4704> v91;
+  alignas(16) static std::array<float, 4704> v92;
+  alignas(16) static std::array<float, 28224> v93;
+  alignas(16) static std::array<float, 28224> v94;
+  alignas(16) static std::array<float, 576> v95;
+  alignas(16) static std::array<float, 1024> v96;
+  alignas(16) static std::array<float, 1024> v97;
+  alignas(16) static std::array<float, 1024> v98;
+  alignas(16) static std::array<float, 1001> v99;
+  alignas(16) static std::array<float, 432> w100;
+  alignas(16) static std::array<float, 16> w101;
+  alignas(16) static std::array<float, 144> w102;
+  alignas(16) static std::array<float, 16> w103;
+  alignas(16) static std::array<float, 128> w104;
+  alignas(16) static std::array<float, 8> w105;
+  alignas(16) static std::array<float, 128> w106;
+  alignas(16) static std::array<float, 16> w107;
+  alignas(16) static std::array<float, 256> w108;
+  alignas(16) static std::array<float, 16> w109;
+  alignas(16) static std::array<float, 1152> w110;
+  alignas(16) static std::array<float, 72> w111;
+  alignas(16) static std::array<float, 648> w112;
+  alignas(16) static std::array<float, 72> w113;
+  alignas(16) static std::array<float, 1728> w114;
+  alignas(16) static std::array<float, 24> w115;
+  alignas(16) static std::array<float, 2112> w116;
+  alignas(16) static std::array<float, 88> w117;
+  alignas(16) static std::array<float, 792> w118;
+  alignas(16) static std::array<float, 88> w119;
+  alignas(16) static std::array<float, 2112> w120;
+  alignas(16) static std::array<float, 24> w121;
+  alignas(16) static std::array<float, 2304> w122;
+  alignas(16) static std::array<float, 96> w123;
+  alignas(16) static std::array<float, 2400> w124;
+  alignas(16) static std::array<float, 96> w125;
+  alignas(16) static std::array<float, 2304> w126;
+  alignas(16) static std::array<float, 24> w127;
+  alignas(16) static std::array<float, 2304> w128;
+  alignas(16) static std::array<float, 96> w129;
+  alignas(16) static std::array<float, 3840> w130;
+  alignas(16) static std::array<float, 40> w131;
+  alignas(16) static std::array<float, 9600> w132;
+  alignas(16) static std::array<float, 240> w133;
+  alignas(16) static std::array<float, 6000> w134;
+  alignas(16) static std::array<float, 240> w135;
+  alignas(16) static std::array<float, 15360> w136;
+  alignas(16) static std::array<float, 64> w137;
+  alignas(16) static std::array<float, 15360> w138;
+  alignas(16) static std::array<float, 240> w139;
+  alignas(16) static std::array<float, 9600> w140;
+  alignas(16) static std::array<float, 40> w141;
+  alignas(16) static std::array<float, 9600> w142;
+  alignas(16) static std::array<float, 240> w143;
+  alignas(16) static std::array<float, 6000> w144;
+  alignas(16) static std::array<float, 240> w145;
+  alignas(16) static std::array<float, 15360> w146;
+  alignas(16) static std::array<float, 64> w147;
+  alignas(16) static std::array<float, 15360> w148;
+  alignas(16) static std::array<float, 240> w149;
+  alignas(16) static std::array<float, 9600> w150;
+  alignas(16) static std::array<float, 40> w151;
+  alignas(16) static std::array<float, 4800> w152;
+  alignas(16) static std::array<float, 120> w153;
+  alignas(16) static std::array<float, 3000> w154;
+  alignas(16) static std::array<float, 120> w155;
+  alignas(16) static std::array<float, 3840> w156;
+  alignas(16) static std::array<float, 32> w157;
+  alignas(16) static std::array<float, 3840> w158;
+  alignas(16) static std::array<float, 120> w159;
+  alignas(16) static std::array<float, 5760> w160;
+  alignas(16) static std::array<float, 48> w161;
+  alignas(16) static std::array<float, 6912> w162;
+  alignas(16) static std::array<float, 144> w163;
+  alignas(16) static std::array<float, 3600> w164;
+  alignas(16) static std::array<float, 144> w165;
+  alignas(16) static std::array<float, 5760> w166;
+  alignas(16) static std::array<float, 40> w167;
+  alignas(16) static std::array<float, 5760> w168;
+  alignas(16) static std::array<float, 144> w169;
+  alignas(16) static std::array<float, 6912> w170;
+  alignas(16) static std::array<float, 48> w171;
+  alignas(16) static std::array<float, 13824> w172;
+  alignas(16) static std::array<float, 288> w173;
+  alignas(16) static std::array<float, 7200> w174;
+  alignas(16) static std::array<float, 288> w175;
+  alignas(16) static std::array<float, 20736> w176;
+  alignas(16) static std::array<float, 72> w177;
+  alignas(16) static std::array<float, 20736> w178;
+  alignas(16) static std::array<float, 288> w179;
+  alignas(16) static std::array<float, 27648> w180;
+  alignas(16) static std::array<float, 96> w181;
+  alignas(16) static std::array<float, 55296> w182;
+  alignas(16) static std::array<float, 576> w183;
+  alignas(16) static std::array<float, 14400> w184;
+  alignas(16) static std::array<float, 576> w185;
+  alignas(16) static std::array<float, 82944> w186;
+  alignas(16) static std::array<float, 144> w187;
+  alignas(16) static std::array<float, 82944> w188;
+  alignas(16) static std::array<float, 576> w189;
+  alignas(16) static std::array<float, 55296> w190;
+  alignas(16) static std::array<float, 96> w191;
+  alignas(16) static std::array<float, 55296> w192;
+  alignas(16) static std::array<float, 576> w193;
+  alignas(16) static std::array<float, 14400> w194;
+  alignas(16) static std::array<float, 576> w195;
+  alignas(16) static std::array<float, 82944> w196;
+  alignas(16) static std::array<float, 144> w197;
+  alignas(16) static std::array<float, 82944> w198;
+  alignas(16) static std::array<float, 576> w199;
+  alignas(16) static std::array<float, 55296> w200;
+  alignas(16) static std::array<float, 96> w201;
+  alignas(16) static std::array<float, 55296> w202;
+  alignas(16) static std::array<float, 576> w203;
+  alignas(16) static std::array<float, 589824> w204;
+  alignas(16) static std::array<float, 1024> w205;
+  alignas(16) static std::array<float, 1025024> w206;
+  alignas(16) static std::array<float, 1001> w207;
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.0f, +1.0f), std::ref(rng));
-  std::generate(v0, v0 + 150528, std::ref(f32rng));
-  std::generate(v1, v1 + 200704, std::ref(f32rng));
-  std::generate(v2, v2 + 200704, std::ref(f32rng));
-  std::generate(v3, v3 + 50176, std::ref(f32rng));
-  std::generate(v4, v4 + 16, std::ref(f32rng));
-  std::generate(v5, v5 + 8, std::ref(f32rng));
-  std::generate(v6, v6 + 16, std::ref(f32rng));
-  std::generate(v7, v7 + 50176, std::ref(f32rng));
-  std::generate(v8, v8 + 50176, std::ref(f32rng));
-  std::generate(v9, v9 + 225792, std::ref(f32rng));
-  std::generate(v10, v10 + 56448, std::ref(f32rng));
-  std::generate(v11, v11 + 18816, std::ref(f32rng));
-  std::generate(v12, v12 + 68992, std::ref(f32rng));
-  std::generate(v13, v13 + 68992, std::ref(f32rng));
-  std::generate(v14, v14 + 18816, std::ref(f32rng));
-  std::generate(v15, v15 + 18816, std::ref(f32rng));
-  std::generate(v16, v16 + 75264, std::ref(f32rng));
-  std::generate(v17, v17 + 75264, std::ref(f32rng));
-  std::generate(v18, v18 + 18816, std::ref(f32rng));
-  std::generate(v19, v19 + 18816, std::ref(f32rng));
-  std::generate(v20, v20 + 96, std::ref(f32rng));
-  std::generate(v21, v21 + 24, std::ref(f32rng));
-  std::generate(v22, v22 + 96, std::ref(f32rng));
-  std::generate(v23, v23 + 18816, std::ref(f32rng));
-  std::generate(v24, v24 + 7840, std::ref(f32rng));
-  std::generate(v25, v25 + 47040, std::ref(f32rng));
-  std::generate(v26, v26 + 47040, std::ref(f32rng));
-  std::generate(v27, v27 + 47040, std::ref(f32rng));
-  std::generate(v28, v28 + 47040, std::ref(f32rng));
-  std::generate(v29, v29 + 240, std::ref(f32rng));
-  std::generate(v30, v30 + 64, std::ref(f32rng));
-  std::generate(v31, v31 + 240, std::ref(f32rng));
-  std::generate(v32, v32 + 47040, std::ref(f32rng));
-  std::generate(v33, v33 + 7840, std::ref(f32rng));
-  std::generate(v34, v34 + 7840, std::ref(f32rng));
-  std::generate(v35, v35 + 47040, std::ref(f32rng));
-  std::generate(v36, v36 + 47040, std::ref(f32rng));
-  std::generate(v37, v37 + 47040, std::ref(f32rng));
-  std::generate(v38, v38 + 47040, std::ref(f32rng));
-  std::generate(v39, v39 + 240, std::ref(f32rng));
-  std::generate(v40, v40 + 64, std::ref(f32rng));
-  std::generate(v41, v41 + 240, std::ref(f32rng));
-  std::generate(v42, v42 + 47040, std::ref(f32rng));
-  std::generate(v43, v43 + 7840, std::ref(f32rng));
-  std::generate(v44, v44 + 7840, std::ref(f32rng));
-  std::generate(v45, v45 + 23520, std::ref(f32rng));
-  std::generate(v46, v46 + 23520, std::ref(f32rng));
-  std::generate(v47, v47 + 23520, std::ref(f32rng));
-  std::generate(v48, v48 + 23520, std::ref(f32rng));
-  std::generate(v49, v49 + 120, std::ref(f32rng));
-  std::generate(v50, v50 + 32, std::ref(f32rng));
-  std::generate(v51, v51 + 120, std::ref(f32rng));
-  std::generate(v52, v52 + 23520, std::ref(f32rng));
-  std::generate(v53, v53 + 9408, std::ref(f32rng));
-  std::generate(v54, v54 + 28224, std::ref(f32rng));
-  std::generate(v55, v55 + 28224, std::ref(f32rng));
-  std::generate(v56, v56 + 28224, std::ref(f32rng));
-  std::generate(v57, v57 + 28224, std::ref(f32rng));
-  std::generate(v58, v58 + 144, std::ref(f32rng));
-  std::generate(v59, v59 + 40, std::ref(f32rng));
-  std::generate(v60, v60 + 144, std::ref(f32rng));
-  std::generate(v61, v61 + 28224, std::ref(f32rng));
-  std::generate(v62, v62 + 9408, std::ref(f32rng));
-  std::generate(v63, v63 + 9408, std::ref(f32rng));
-  std::generate(v64, v64 + 56448, std::ref(f32rng));
-  std::generate(v65, v65 + 56448, std::ref(f32rng));
-  std::generate(v66, v66 + 14112, std::ref(f32rng));
-  std::generate(v67, v67 + 14112, std::ref(f32rng));
-  std::generate(v68, v68 + 288, std::ref(f32rng));
-  std::generate(v69, v69 + 72, std::ref(f32rng));
-  std::generate(v70, v70 + 288, std::ref(f32rng));
-  std::generate(v71, v71 + 14112, std::ref(f32rng));
-  std::generate(v72, v72 + 4704, std::ref(f32rng));
-  std::generate(v73, v73 + 28224, std::ref(f32rng));
-  std::generate(v74, v74 + 28224, std::ref(f32rng));
-  std::generate(v75, v75 + 28224, std::ref(f32rng));
-  std::generate(v76, v76 + 28224, std::ref(f32rng));
-  std::generate(v77, v77 + 576, std::ref(f32rng));
-  std::generate(v78, v78 + 144, std::ref(f32rng));
-  std::generate(v79, v79 + 576, std::ref(f32rng));
-  std::generate(v80, v80 + 28224, std::ref(f32rng));
-  std::generate(v81, v81 + 4704, std::ref(f32rng));
-  std::generate(v82, v82 + 4704, std::ref(f32rng));
-  std::generate(v83, v83 + 28224, std::ref(f32rng));
-  std::generate(v84, v84 + 28224, std::ref(f32rng));
-  std::generate(v85, v85 + 28224, std::ref(f32rng));
-  std::generate(v86, v86 + 28224, std::ref(f32rng));
-  std::generate(v87, v87 + 576, std::ref(f32rng));
-  std::generate(v88, v88 + 144, std::ref(f32rng));
-  std::generate(v89, v89 + 576, std::ref(f32rng));
-  std::generate(v90, v90 + 28224, std::ref(f32rng));
-  std::generate(v91, v91 + 4704, std::ref(f32rng));
-  std::generate(v92, v92 + 4704, std::ref(f32rng));
-  std::generate(v93, v93 + 28224, std::ref(f32rng));
-  std::generate(v94, v94 + 28224, std::ref(f32rng));
-  std::generate(v95, v95 + 576, std::ref(f32rng));
-  std::generate(v96, v96 + 1024, std::ref(f32rng));
-  std::generate(v97, v97 + 1024, std::ref(f32rng));
-  std::generate(v98, v98 + 1024, std::ref(f32rng));
-  std::generate(v99, v99 + 1001, std::ref(f32rng));
-  std::generate(w100, w100 + 432, std::ref(f32rng));
-  std::generate(w101, w101 + 16, std::ref(f32rng));
-  std::generate(w102, w102 + 144, std::ref(f32rng));
-  std::generate(w103, w103 + 16, std::ref(f32rng));
-  std::generate(w104, w104 + 128, std::ref(f32rng));
-  std::generate(w105, w105 + 8, std::ref(f32rng));
-  std::generate(w106, w106 + 128, std::ref(f32rng));
-  std::generate(w107, w107 + 16, std::ref(f32rng));
-  std::generate(w108, w108 + 256, std::ref(f32rng));
-  std::generate(w109, w109 + 16, std::ref(f32rng));
-  std::generate(w110, w110 + 1152, std::ref(f32rng));
-  std::generate(w111, w111 + 72, std::ref(f32rng));
-  std::generate(w112, w112 + 648, std::ref(f32rng));
-  std::generate(w113, w113 + 72, std::ref(f32rng));
-  std::generate(w114, w114 + 1728, std::ref(f32rng));
-  std::generate(w115, w115 + 24, std::ref(f32rng));
-  std::generate(w116, w116 + 2112, std::ref(f32rng));
-  std::generate(w117, w117 + 88, std::ref(f32rng));
-  std::generate(w118, w118 + 792, std::ref(f32rng));
-  std::generate(w119, w119 + 88, std::ref(f32rng));
-  std::generate(w120, w120 + 2112, std::ref(f32rng));
-  std::generate(w121, w121 + 24, std::ref(f32rng));
-  std::generate(w122, w122 + 2304, std::ref(f32rng));
-  std::generate(w123, w123 + 96, std::ref(f32rng));
-  std::generate(w124, w124 + 2400, std::ref(f32rng));
-  std::generate(w125, w125 + 96, std::ref(f32rng));
-  std::generate(w126, w126 + 2304, std::ref(f32rng));
-  std::generate(w127, w127 + 24, std::ref(f32rng));
-  std::generate(w128, w128 + 2304, std::ref(f32rng));
-  std::generate(w129, w129 + 96, std::ref(f32rng));
-  std::generate(w130, w130 + 3840, std::ref(f32rng));
-  std::generate(w131, w131 + 40, std::ref(f32rng));
-  std::generate(w132, w132 + 9600, std::ref(f32rng));
-  std::generate(w133, w133 + 240, std::ref(f32rng));
-  std::generate(w134, w134 + 6000, std::ref(f32rng));
-  std::generate(w135, w135 + 240, std::ref(f32rng));
-  std::generate(w136, w136 + 15360, std::ref(f32rng));
-  std::generate(w137, w137 + 64, std::ref(f32rng));
-  std::generate(w138, w138 + 15360, std::ref(f32rng));
-  std::generate(w139, w139 + 240, std::ref(f32rng));
-  std::generate(w140, w140 + 9600, std::ref(f32rng));
-  std::generate(w141, w141 + 40, std::ref(f32rng));
-  std::generate(w142, w142 + 9600, std::ref(f32rng));
-  std::generate(w143, w143 + 240, std::ref(f32rng));
-  std::generate(w144, w144 + 6000, std::ref(f32rng));
-  std::generate(w145, w145 + 240, std::ref(f32rng));
-  std::generate(w146, w146 + 15360, std::ref(f32rng));
-  std::generate(w147, w147 + 64, std::ref(f32rng));
-  std::generate(w148, w148 + 15360, std::ref(f32rng));
-  std::generate(w149, w149 + 240, std::ref(f32rng));
-  std::generate(w150, w150 + 9600, std::ref(f32rng));
-  std::generate(w151, w151 + 40, std::ref(f32rng));
-  std::generate(w152, w152 + 4800, std::ref(f32rng));
-  std::generate(w153, w153 + 120, std::ref(f32rng));
-  std::generate(w154, w154 + 3000, std::ref(f32rng));
-  std::generate(w155, w155 + 120, std::ref(f32rng));
-  std::generate(w156, w156 + 3840, std::ref(f32rng));
-  std::generate(w157, w157 + 32, std::ref(f32rng));
-  std::generate(w158, w158 + 3840, std::ref(f32rng));
-  std::generate(w159, w159 + 120, std::ref(f32rng));
-  std::generate(w160, w160 + 5760, std::ref(f32rng));
-  std::generate(w161, w161 + 48, std::ref(f32rng));
-  std::generate(w162, w162 + 6912, std::ref(f32rng));
-  std::generate(w163, w163 + 144, std::ref(f32rng));
-  std::generate(w164, w164 + 3600, std::ref(f32rng));
-  std::generate(w165, w165 + 144, std::ref(f32rng));
-  std::generate(w166, w166 + 5760, std::ref(f32rng));
-  std::generate(w167, w167 + 40, std::ref(f32rng));
-  std::generate(w168, w168 + 5760, std::ref(f32rng));
-  std::generate(w169, w169 + 144, std::ref(f32rng));
-  std::generate(w170, w170 + 6912, std::ref(f32rng));
-  std::generate(w171, w171 + 48, std::ref(f32rng));
-  std::generate(w172, w172 + 13824, std::ref(f32rng));
-  std::generate(w173, w173 + 288, std::ref(f32rng));
-  std::generate(w174, w174 + 7200, std::ref(f32rng));
-  std::generate(w175, w175 + 288, std::ref(f32rng));
-  std::generate(w176, w176 + 20736, std::ref(f32rng));
-  std::generate(w177, w177 + 72, std::ref(f32rng));
-  std::generate(w178, w178 + 20736, std::ref(f32rng));
-  std::generate(w179, w179 + 288, std::ref(f32rng));
-  std::generate(w180, w180 + 27648, std::ref(f32rng));
-  std::generate(w181, w181 + 96, std::ref(f32rng));
-  std::generate(w182, w182 + 55296, std::ref(f32rng));
-  std::generate(w183, w183 + 576, std::ref(f32rng));
-  std::generate(w184, w184 + 14400, std::ref(f32rng));
-  std::generate(w185, w185 + 576, std::ref(f32rng));
-  std::generate(w186, w186 + 82944, std::ref(f32rng));
-  std::generate(w187, w187 + 144, std::ref(f32rng));
-  std::generate(w188, w188 + 82944, std::ref(f32rng));
-  std::generate(w189, w189 + 576, std::ref(f32rng));
-  std::generate(w190, w190 + 55296, std::ref(f32rng));
-  std::generate(w191, w191 + 96, std::ref(f32rng));
-  std::generate(w192, w192 + 55296, std::ref(f32rng));
-  std::generate(w193, w193 + 576, std::ref(f32rng));
-  std::generate(w194, w194 + 14400, std::ref(f32rng));
-  std::generate(w195, w195 + 576, std::ref(f32rng));
-  std::generate(w196, w196 + 82944, std::ref(f32rng));
-  std::generate(w197, w197 + 144, std::ref(f32rng));
-  std::generate(w198, w198 + 82944, std::ref(f32rng));
-  std::generate(w199, w199 + 576, std::ref(f32rng));
-  std::generate(w200, w200 + 55296, std::ref(f32rng));
-  std::generate(w201, w201 + 96, std::ref(f32rng));
-  std::generate(w202, w202 + 55296, std::ref(f32rng));
-  std::generate(w203, w203 + 576, std::ref(f32rng));
-  std::generate(w204, w204 + 589824, std::ref(f32rng));
-  std::generate(w205, w205 + 1024, std::ref(f32rng));
-  std::generate(w206, w206 + 1025024, std::ref(f32rng));
-  std::generate(w207, w207 + 1001, std::ref(f32rng));
+  std::generate(v0.begin(), v0.end(), std::ref(f32rng));
+  std::generate(v1.begin(), v1.end(), std::ref(f32rng));
+  std::generate(v2.begin(), v2.end(), std::ref(f32rng));
+  std::generate(v3.begin(), v3.end(), std::ref(f32rng));
+  std::generate(v4.begin(), v4.end(), std::ref(f32rng));
+  std::generate(v5.begin(), v5.end(), std::ref(f32rng));
+  std::generate(v6.begin(), v6.end(), std::ref(f32rng));
+  std::generate(v7.begin(), v7.end(), std::ref(f32rng));
+  std::generate(v8.begin(), v8.end(), std::ref(f32rng));
+  std::generate(v9.begin(), v9.end(), std::ref(f32rng));
+  std::generate(v10.begin(), v10.end(), std::ref(f32rng));
+  std::generate(v11.begin(), v11.end(), std::ref(f32rng));
+  std::generate(v12.begin(), v12.end(), std::ref(f32rng));
+  std::generate(v13.begin(), v13.end(), std::ref(f32rng));
+  std::generate(v14.begin(), v14.end(), std::ref(f32rng));
+  std::generate(v15.begin(), v15.end(), std::ref(f32rng));
+  std::generate(v16.begin(), v16.end(), std::ref(f32rng));
+  std::generate(v17.begin(), v17.end(), std::ref(f32rng));
+  std::generate(v18.begin(), v18.end(), std::ref(f32rng));
+  std::generate(v19.begin(), v19.end(), std::ref(f32rng));
+  std::generate(v20.begin(), v20.end(), std::ref(f32rng));
+  std::generate(v21.begin(), v21.end(), std::ref(f32rng));
+  std::generate(v22.begin(), v22.end(), std::ref(f32rng));
+  std::generate(v23.begin(), v23.end(), std::ref(f32rng));
+  std::generate(v24.begin(), v24.end(), std::ref(f32rng));
+  std::generate(v25.begin(), v25.end(), std::ref(f32rng));
+  std::generate(v26.begin(), v26.end(), std::ref(f32rng));
+  std::generate(v27.begin(), v27.end(), std::ref(f32rng));
+  std::generate(v28.begin(), v28.end(), std::ref(f32rng));
+  std::generate(v29.begin(), v29.end(), std::ref(f32rng));
+  std::generate(v30.begin(), v30.end(), std::ref(f32rng));
+  std::generate(v31.begin(), v31.end(), std::ref(f32rng));
+  std::generate(v32.begin(), v32.end(), std::ref(f32rng));
+  std::generate(v33.begin(), v33.end(), std::ref(f32rng));
+  std::generate(v34.begin(), v34.end(), std::ref(f32rng));
+  std::generate(v35.begin(), v35.end(), std::ref(f32rng));
+  std::generate(v36.begin(), v36.end(), std::ref(f32rng));
+  std::generate(v37.begin(), v37.end(), std::ref(f32rng));
+  std::generate(v38.begin(), v38.end(), std::ref(f32rng));
+  std::generate(v39.begin(), v39.end(), std::ref(f32rng));
+  std::generate(v40.begin(), v40.end(), std::ref(f32rng));
+  std::generate(v41.begin(), v41.end(), std::ref(f32rng));
+  std::generate(v42.begin(), v42.end(), std::ref(f32rng));
+  std::generate(v43.begin(), v43.end(), std::ref(f32rng));
+  std::generate(v44.begin(), v44.end(), std::ref(f32rng));
+  std::generate(v45.begin(), v45.end(), std::ref(f32rng));
+  std::generate(v46.begin(), v46.end(), std::ref(f32rng));
+  std::generate(v47.begin(), v47.end(), std::ref(f32rng));
+  std::generate(v48.begin(), v48.end(), std::ref(f32rng));
+  std::generate(v49.begin(), v49.end(), std::ref(f32rng));
+  std::generate(v50.begin(), v50.end(), std::ref(f32rng));
+  std::generate(v51.begin(), v51.end(), std::ref(f32rng));
+  std::generate(v52.begin(), v52.end(), std::ref(f32rng));
+  std::generate(v53.begin(), v53.end(), std::ref(f32rng));
+  std::generate(v54.begin(), v54.end(), std::ref(f32rng));
+  std::generate(v55.begin(), v55.end(), std::ref(f32rng));
+  std::generate(v56.begin(), v56.end(), std::ref(f32rng));
+  std::generate(v57.begin(), v57.end(), std::ref(f32rng));
+  std::generate(v58.begin(), v58.end(), std::ref(f32rng));
+  std::generate(v59.begin(), v59.end(), std::ref(f32rng));
+  std::generate(v60.begin(), v60.end(), std::ref(f32rng));
+  std::generate(v61.begin(), v61.end(), std::ref(f32rng));
+  std::generate(v62.begin(), v62.end(), std::ref(f32rng));
+  std::generate(v63.begin(), v63.end(), std::ref(f32rng));
+  std::generate(v64.begin(), v64.end(), std::ref(f32rng));
+  std::generate(v65.begin(), v65.end(), std::ref(f32rng));
+  std::generate(v66.begin(), v66.end(), std::ref(f32rng));
+  std::generate(v67.begin(), v67.end(), std::ref(f32rng));
+  std::generate(v68.begin(), v68.end(), std::ref(f32rng));
+  std::generate(v69.begin(), v69.end(), std::ref(f32rng));
+  std::generate(v70.begin(), v70.end(), std::ref(f32rng));
+  std::generate(v71.begin(), v71.end(), std::ref(f32rng));
+  std::generate(v72.begin(), v72.end(), std::ref(f32rng));
+  std::generate(v73.begin(), v73.end(), std::ref(f32rng));
+  std::generate(v74.begin(), v74.end(), std::ref(f32rng));
+  std::generate(v75.begin(), v75.end(), std::ref(f32rng));
+  std::generate(v76.begin(), v76.end(), std::ref(f32rng));
+  std::generate(v77.begin(), v77.end(), std::ref(f32rng));
+  std::generate(v78.begin(), v78.end(), std::ref(f32rng));
+  std::generate(v79.begin(), v79.end(), std::ref(f32rng));
+  std::generate(v80.begin(), v80.end(), std::ref(f32rng));
+  std::generate(v81.begin(), v81.end(), std::ref(f32rng));
+  std::generate(v82.begin(), v82.end(), std::ref(f32rng));
+  std::generate(v83.begin(), v83.end(), std::ref(f32rng));
+  std::generate(v84.begin(), v84.end(), std::ref(f32rng));
+  std::generate(v85.begin(), v85.end(), std::ref(f32rng));
+  std::generate(v86.begin(), v86.end(), std::ref(f32rng));
+  std::generate(v87.begin(), v87.end(), std::ref(f32rng));
+  std::generate(v88.begin(), v88.end(), std::ref(f32rng));
+  std::generate(v89.begin(), v89.end(), std::ref(f32rng));
+  std::generate(v90.begin(), v90.end(), std::ref(f32rng));
+  std::generate(v91.begin(), v91.end(), std::ref(f32rng));
+  std::generate(v92.begin(), v92.end(), std::ref(f32rng));
+  std::generate(v93.begin(), v93.end(), std::ref(f32rng));
+  std::generate(v94.begin(), v94.end(), std::ref(f32rng));
+  std::generate(v95.begin(), v95.end(), std::ref(f32rng));
+  std::generate(v96.begin(), v96.end(), std::ref(f32rng));
+  std::generate(v97.begin(), v97.end(), std::ref(f32rng));
+  std::generate(v98.begin(), v98.end(), std::ref(f32rng));
+  std::generate(v99.begin(), v99.end(), std::ref(f32rng));
+  std::generate(w100.begin(), w100.end(), std::ref(f32rng));
+  std::generate(w101.begin(), w101.end(), std::ref(f32rng));
+  std::generate(w102.begin(), w102.end(), std::ref(f32rng));
+  std::generate(w103.begin(), w103.end(), std::ref(f32rng));
+  std::generate(w104.begin(), w104.end(), std::ref(f32rng));
+  std::generate(w105.begin(), w105.end(), std::ref(f32rng));
+  std::generate(w106.begin(), w106.end(), std::ref(f32rng));
+  std::generate(w107.begin(), w107.end(), std::ref(f32rng));
+  std::generate(w108.begin(), w108.end(), std::ref(f32rng));
+  std::generate(w109.begin(), w109.end(), std::ref(f32rng));
+  std::generate(w110.begin(), w110.end(), std::ref(f32rng));
+  std::generate(w111.begin(), w111.end(), std::ref(f32rng));
+  std::generate(w112.begin(), w112.end(), std::ref(f32rng));
+  std::generate(w113.begin(), w113.end(), std::ref(f32rng));
+  std::generate(w114.begin(), w114.end(), std::ref(f32rng));
+  std::generate(w115.begin(), w115.end(), std::ref(f32rng));
+  std::generate(w116.begin(), w116.end(), std::ref(f32rng));
+  std::generate(w117.begin(), w117.end(), std::ref(f32rng));
+  std::generate(w118.begin(), w118.end(), std::ref(f32rng));
+  std::generate(w119.begin(), w119.end(), std::ref(f32rng));
+  std::generate(w120.begin(), w120.end(), std::ref(f32rng));
+  std::generate(w121.begin(), w121.end(), std::ref(f32rng));
+  std::generate(w122.begin(), w122.end(), std::ref(f32rng));
+  std::generate(w123.begin(), w123.end(), std::ref(f32rng));
+  std::generate(w124.begin(), w124.end(), std::ref(f32rng));
+  std::generate(w125.begin(), w125.end(), std::ref(f32rng));
+  std::generate(w126.begin(), w126.end(), std::ref(f32rng));
+  std::generate(w127.begin(), w127.end(), std::ref(f32rng));
+  std::generate(w128.begin(), w128.end(), std::ref(f32rng));
+  std::generate(w129.begin(), w129.end(), std::ref(f32rng));
+  std::generate(w130.begin(), w130.end(), std::ref(f32rng));
+  std::generate(w131.begin(), w131.end(), std::ref(f32rng));
+  std::generate(w132.begin(), w132.end(), std::ref(f32rng));
+  std::generate(w133.begin(), w133.end(), std::ref(f32rng));
+  std::generate(w134.begin(), w134.end(), std::ref(f32rng));
+  std::generate(w135.begin(), w135.end(), std::ref(f32rng));
+  std::generate(w136.begin(), w136.end(), std::ref(f32rng));
+  std::generate(w137.begin(), w137.end(), std::ref(f32rng));
+  std::generate(w138.begin(), w138.end(), std::ref(f32rng));
+  std::generate(w139.begin(), w139.end(), std::ref(f32rng));
+  std::generate(w140.begin(), w140.end(), std::ref(f32rng));
+  std::generate(w141.begin(), w141.end(), std::ref(f32rng));
+  std::generate(w142.begin(), w142.end(), std::ref(f32rng));
+  std::generate(w143.begin(), w143.end(), std::ref(f32rng));
+  std::generate(w144.begin(), w144.end(), std::ref(f32rng));
+  std::generate(w145.begin(), w145.end(), std::ref(f32rng));
+  std::generate(w146.begin(), w146.end(), std::ref(f32rng));
+  std::generate(w147.begin(), w147.end(), std::ref(f32rng));
+  std::generate(w148.begin(), w148.end(), std::ref(f32rng));
+  std::generate(w149.begin(), w149.end(), std::ref(f32rng));
+  std::generate(w150.begin(), w150.end(), std::ref(f32rng));
+  std::generate(w151.begin(), w151.end(), std::ref(f32rng));
+  std::generate(w152.begin(), w152.end(), std::ref(f32rng));
+  std::generate(w153.begin(), w153.end(), std::ref(f32rng));
+  std::generate(w154.begin(), w154.end(), std::ref(f32rng));
+  std::generate(w155.begin(), w155.end(), std::ref(f32rng));
+  std::generate(w156.begin(), w156.end(), std::ref(f32rng));
+  std::generate(w157.begin(), w157.end(), std::ref(f32rng));
+  std::generate(w158.begin(), w158.end(), std::ref(f32rng));
+  std::generate(w159.begin(), w159.end(), std::ref(f32rng));
+  std::generate(w160.begin(), w160.end(), std::ref(f32rng));
+  std::generate(w161.begin(), w161.end(), std::ref(f32rng));
+  std::generate(w162.begin(), w162.end(), std::ref(f32rng));
+  std::generate(w163.begin(), w163.end(), std::ref(f32rng));
+  std::generate(w164.begin(), w164.end(), std::ref(f32rng));
+  std::generate(w165.begin(), w165.end(), std::ref(f32rng));
+  std::generate(w166.begin(), w166.end(), std::ref(f32rng));
+  std::generate(w167.begin(), w167.end(), std::ref(f32rng));
+  std::generate(w168.begin(), w168.end(), std::ref(f32rng));
+  std::generate(w169.begin(), w169.end(), std::ref(f32rng));
+  std::generate(w170.begin(), w170.end(), std::ref(f32rng));
+  std::generate(w171.begin(), w171.end(), std::ref(f32rng));
+  std::generate(w172.begin(), w172.end(), std::ref(f32rng));
+  std::generate(w173.begin(), w173.end(), std::ref(f32rng));
+  std::generate(w174.begin(), w174.end(), std::ref(f32rng));
+  std::generate(w175.begin(), w175.end(), std::ref(f32rng));
+  std::generate(w176.begin(), w176.end(), std::ref(f32rng));
+  std::generate(w177.begin(), w177.end(), std::ref(f32rng));
+  std::generate(w178.begin(), w178.end(), std::ref(f32rng));
+  std::generate(w179.begin(), w179.end(), std::ref(f32rng));
+  std::generate(w180.begin(), w180.end(), std::ref(f32rng));
+  std::generate(w181.begin(), w181.end(), std::ref(f32rng));
+  std::generate(w182.begin(), w182.end(), std::ref(f32rng));
+  std::generate(w183.begin(), w183.end(), std::ref(f32rng));
+  std::generate(w184.begin(), w184.end(), std::ref(f32rng));
+  std::generate(w185.begin(), w185.end(), std::ref(f32rng));
+  std::generate(w186.begin(), w186.end(), std::ref(f32rng));
+  std::generate(w187.begin(), w187.end(), std::ref(f32rng));
+  std::generate(w188.begin(), w188.end(), std::ref(f32rng));
+  std::generate(w189.begin(), w189.end(), std::ref(f32rng));
+  std::generate(w190.begin(), w190.end(), std::ref(f32rng));
+  std::generate(w191.begin(), w191.end(), std::ref(f32rng));
+  std::generate(w192.begin(), w192.end(), std::ref(f32rng));
+  std::generate(w193.begin(), w193.end(), std::ref(f32rng));
+  std::generate(w194.begin(), w194.end(), std::ref(f32rng));
+  std::generate(w195.begin(), w195.end(), std::ref(f32rng));
+  std::generate(w196.begin(), w196.end(), std::ref(f32rng));
+  std::generate(w197.begin(), w197.end(), std::ref(f32rng));
+  std::generate(w198.begin(), w198.end(), std::ref(f32rng));
+  std::generate(w199.begin(), w199.end(), std::ref(f32rng));
+  std::generate(w200.begin(), w200.end(), std::ref(f32rng));
+  std::generate(w201.begin(), w201.end(), std::ref(f32rng));
+  std::generate(w202.begin(), w202.end(), std::ref(f32rng));
+  std::generate(w203.begin(), w203.end(), std::ref(f32rng));
+  std::generate(w204.begin(), w204.end(), std::ref(f32rng));
+  std::generate(w205.begin(), w205.end(), std::ref(f32rng));
+  std::generate(w206.begin(), w206.end(), std::ref(f32rng));
+  std::generate(w207.begin(), w207.end(), std::ref(f32rng));
 
   ExecutionPlan operators;
   xnn_status status;
@@ -452,7 +453,7 @@
     16 /* output_channels_per_group */,
     3 /* input pixel stride */,
     16 /* output pixel stride */,
-    w100, w101,
+    w100.data(), w101.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op0);
@@ -487,7 +488,7 @@
     1 /* output_channels_per_group */,
     16 /* input pixel stride */,
     16 /* output pixel stride */,
-    w102, w103,
+    w102.data(), w103.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op2);
@@ -521,7 +522,7 @@
     8 /* output_channels_per_group */,
     16 /* input pixel stride */,
     8 /* output pixel stride */,
-    w104, w105,
+    w104.data(), w105.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op4);
@@ -543,7 +544,7 @@
     16 /* output_channels_per_group */,
     8 /* input pixel stride */,
     16 /* output pixel stride */,
-    w106, w107,
+    w106.data(), w107.data(),
     0.0f /* output min */, +0x1.00014Fp+0 /* output max */,
     0 /* flags */,
     &op5);
@@ -576,7 +577,7 @@
     16 /* output_channels_per_group */,
     16 /* input pixel stride */,
     16 /* output pixel stride */,
-    w108, w109,
+    w108.data(), w109.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op7);
@@ -598,7 +599,7 @@
     72 /* output_channels_per_group */,
     16 /* input pixel stride */,
     72 /* output pixel stride */,
-    w110, w111,
+    w110.data(), w111.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op8);
@@ -620,7 +621,7 @@
     1 /* output_channels_per_group */,
     72 /* input pixel stride */,
     72 /* output pixel stride */,
-    w112, w113,
+    w112.data(), w113.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op9);
@@ -642,7 +643,7 @@
     24 /* output_channels_per_group */,
     72 /* input pixel stride */,
     24 /* output pixel stride */,
-    w114, w115,
+    w114.data(), w115.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op10);
@@ -664,7 +665,7 @@
     88 /* output_channels_per_group */,
     24 /* input pixel stride */,
     88 /* output pixel stride */,
-    w116, w117,
+    w116.data(), w117.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op11);
@@ -686,7 +687,7 @@
     1 /* output_channels_per_group */,
     88 /* input pixel stride */,
     88 /* output pixel stride */,
-    w118, w119,
+    w118.data(), w119.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op12);
@@ -708,7 +709,7 @@
     24 /* output_channels_per_group */,
     88 /* input pixel stride */,
     24 /* output pixel stride */,
-    w120, w121,
+    w120.data(), w121.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op13);
@@ -741,7 +742,7 @@
     96 /* output_channels_per_group */,
     24 /* input pixel stride */,
     96 /* output pixel stride */,
-    w122, w123,
+    w122.data(), w123.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op15);
@@ -776,7 +777,7 @@
     1 /* output_channels_per_group */,
     96 /* input pixel stride */,
     96 /* output pixel stride */,
-    w124, w125,
+    w124.data(), w125.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op17);
@@ -823,7 +824,7 @@
     24 /* output_channels_per_group */,
     96 /* input pixel stride */,
     24 /* output pixel stride */,
-    w126, w127,
+    w126.data(), w127.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op20);
@@ -845,7 +846,7 @@
     96 /* output_channels_per_group */,
     24 /* input pixel stride */,
     96 /* output pixel stride */,
-    w128, w129,
+    w128.data(), w129.data(),
     0.0f /* output min */, +0x1.00014Fp+0 /* output max */,
     0 /* flags */,
     &op21);
@@ -878,7 +879,7 @@
     40 /* output_channels_per_group */,
     96 /* input pixel stride */,
     40 /* output pixel stride */,
-    w130, w131,
+    w130.data(), w131.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op23);
@@ -900,7 +901,7 @@
     240 /* output_channels_per_group */,
     40 /* input pixel stride */,
     240 /* output pixel stride */,
-    w132, w133,
+    w132.data(), w133.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op24);
@@ -935,7 +936,7 @@
     1 /* output_channels_per_group */,
     240 /* input pixel stride */,
     240 /* output pixel stride */,
-    w134, w135,
+    w134.data(), w135.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op26);
@@ -982,7 +983,7 @@
     64 /* output_channels_per_group */,
     240 /* input pixel stride */,
     64 /* output pixel stride */,
-    w136, w137,
+    w136.data(), w137.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op29);
@@ -1004,7 +1005,7 @@
     240 /* output_channels_per_group */,
     64 /* input pixel stride */,
     240 /* output pixel stride */,
-    w138, w139,
+    w138.data(), w139.data(),
     0.0f /* output min */, +0x1.00014Fp+0 /* output max */,
     0 /* flags */,
     &op30);
@@ -1037,7 +1038,7 @@
     40 /* output_channels_per_group */,
     240 /* input pixel stride */,
     40 /* output pixel stride */,
-    w140, w141,
+    w140.data(), w141.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op32);
@@ -1070,7 +1071,7 @@
     240 /* output_channels_per_group */,
     40 /* input pixel stride */,
     240 /* output pixel stride */,
-    w142, w143,
+    w142.data(), w143.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op34);
@@ -1105,7 +1106,7 @@
     1 /* output_channels_per_group */,
     240 /* input pixel stride */,
     240 /* output pixel stride */,
-    w144, w145,
+    w144.data(), w145.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op36);
@@ -1152,7 +1153,7 @@
     64 /* output_channels_per_group */,
     240 /* input pixel stride */,
     64 /* output pixel stride */,
-    w146, w147,
+    w146.data(), w147.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op39);
@@ -1174,7 +1175,7 @@
     240 /* output_channels_per_group */,
     64 /* input pixel stride */,
     240 /* output pixel stride */,
-    w148, w149,
+    w148.data(), w149.data(),
     0.0f /* output min */, +0x1.00014Fp+0 /* output max */,
     0 /* flags */,
     &op40);
@@ -1207,7 +1208,7 @@
     40 /* output_channels_per_group */,
     240 /* input pixel stride */,
     40 /* output pixel stride */,
-    w150, w151,
+    w150.data(), w151.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op42);
@@ -1240,7 +1241,7 @@
     120 /* output_channels_per_group */,
     40 /* input pixel stride */,
     120 /* output pixel stride */,
-    w152, w153,
+    w152.data(), w153.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op44);
@@ -1275,7 +1276,7 @@
     1 /* output_channels_per_group */,
     120 /* input pixel stride */,
     120 /* output pixel stride */,
-    w154, w155,
+    w154.data(), w155.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op46);
@@ -1322,7 +1323,7 @@
     32 /* output_channels_per_group */,
     120 /* input pixel stride */,
     32 /* output pixel stride */,
-    w156, w157,
+    w156.data(), w157.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op49);
@@ -1344,7 +1345,7 @@
     120 /* output_channels_per_group */,
     32 /* input pixel stride */,
     120 /* output pixel stride */,
-    w158, w159,
+    w158.data(), w159.data(),
     0.0f /* output min */, +0x1.00014Fp+0 /* output max */,
     0 /* flags */,
     &op50);
@@ -1377,7 +1378,7 @@
     48 /* output_channels_per_group */,
     120 /* input pixel stride */,
     48 /* output pixel stride */,
-    w160, w161,
+    w160.data(), w161.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op52);
@@ -1399,7 +1400,7 @@
     144 /* output_channels_per_group */,
     48 /* input pixel stride */,
     144 /* output pixel stride */,
-    w162, w163,
+    w162.data(), w163.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op53);
@@ -1434,7 +1435,7 @@
     1 /* output_channels_per_group */,
     144 /* input pixel stride */,
     144 /* output pixel stride */,
-    w164, w165,
+    w164.data(), w165.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op55);
@@ -1481,7 +1482,7 @@
     40 /* output_channels_per_group */,
     144 /* input pixel stride */,
     40 /* output pixel stride */,
-    w166, w167,
+    w166.data(), w167.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op58);
@@ -1503,7 +1504,7 @@
     144 /* output_channels_per_group */,
     40 /* input pixel stride */,
     144 /* output pixel stride */,
-    w168, w169,
+    w168.data(), w169.data(),
     0.0f /* output min */, +0x1.00014Fp+0 /* output max */,
     0 /* flags */,
     &op59);
@@ -1536,7 +1537,7 @@
     48 /* output_channels_per_group */,
     144 /* input pixel stride */,
     48 /* output pixel stride */,
-    w170, w171,
+    w170.data(), w171.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op61);
@@ -1569,7 +1570,7 @@
     288 /* output_channels_per_group */,
     48 /* input pixel stride */,
     288 /* output pixel stride */,
-    w172, w173,
+    w172.data(), w173.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op63);
@@ -1604,7 +1605,7 @@
     1 /* output_channels_per_group */,
     288 /* input pixel stride */,
     288 /* output pixel stride */,
-    w174, w175,
+    w174.data(), w175.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op65);
@@ -1651,7 +1652,7 @@
     72 /* output_channels_per_group */,
     288 /* input pixel stride */,
     72 /* output pixel stride */,
-    w176, w177,
+    w176.data(), w177.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op68);
@@ -1673,7 +1674,7 @@
     288 /* output_channels_per_group */,
     72 /* input pixel stride */,
     288 /* output pixel stride */,
-    w178, w179,
+    w178.data(), w179.data(),
     0.0f /* output min */, +0x1.00014Fp+0 /* output max */,
     0 /* flags */,
     &op69);
@@ -1706,7 +1707,7 @@
     96 /* output_channels_per_group */,
     288 /* input pixel stride */,
     96 /* output pixel stride */,
-    w180, w181,
+    w180.data(), w181.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op71);
@@ -1728,7 +1729,7 @@
     576 /* output_channels_per_group */,
     96 /* input pixel stride */,
     576 /* output pixel stride */,
-    w182, w183,
+    w182.data(), w183.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op72);
@@ -1763,7 +1764,7 @@
     1 /* output_channels_per_group */,
     576 /* input pixel stride */,
     576 /* output pixel stride */,
-    w184, w185,
+    w184.data(), w185.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op74);
@@ -1810,7 +1811,7 @@
     144 /* output_channels_per_group */,
     576 /* input pixel stride */,
     144 /* output pixel stride */,
-    w186, w187,
+    w186.data(), w187.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op77);
@@ -1832,7 +1833,7 @@
     576 /* output_channels_per_group */,
     144 /* input pixel stride */,
     576 /* output pixel stride */,
-    w188, w189,
+    w188.data(), w189.data(),
     0.0f /* output min */, +0x1.00014Fp+0 /* output max */,
     0 /* flags */,
     &op78);
@@ -1865,7 +1866,7 @@
     96 /* output_channels_per_group */,
     576 /* input pixel stride */,
     96 /* output pixel stride */,
-    w190, w191,
+    w190.data(), w191.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op80);
@@ -1898,7 +1899,7 @@
     576 /* output_channels_per_group */,
     96 /* input pixel stride */,
     576 /* output pixel stride */,
-    w192, w193,
+    w192.data(), w193.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op82);
@@ -1933,7 +1934,7 @@
     1 /* output_channels_per_group */,
     576 /* input pixel stride */,
     576 /* output pixel stride */,
-    w194, w195,
+    w194.data(), w195.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op84);
@@ -1980,7 +1981,7 @@
     144 /* output_channels_per_group */,
     576 /* input pixel stride */,
     144 /* output pixel stride */,
-    w196, w197,
+    w196.data(), w197.data(),
     0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op87);
@@ -2002,7 +2003,7 @@
     576 /* output_channels_per_group */,
     144 /* input pixel stride */,
     576 /* output pixel stride */,
-    w198, w199,
+    w198.data(), w199.data(),
     0.0f /* output min */, +0x1.00014Fp+0 /* output max */,
     0 /* flags */,
     &op88);
@@ -2035,7 +2036,7 @@
     96 /* output_channels_per_group */,
     576 /* input pixel stride */,
     96 /* output pixel stride */,
-    w200, w201,
+    w200.data(), w201.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op90);
@@ -2068,7 +2069,7 @@
     576 /* output_channels_per_group */,
     96 /* input pixel stride */,
     576 /* output pixel stride */,
-    w202, w203,
+    w202.data(), w203.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op92);
@@ -2115,7 +2116,7 @@
     1024 /* output_channels_per_group */,
     576 /* input pixel stride */,
     1024 /* output pixel stride */,
-    w204, w205,
+    w204.data(), w205.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op95);
@@ -2162,7 +2163,7 @@
     1001 /* output_channels_per_group */,
     1024 /* input pixel stride */,
     1001 /* output pixel stride */,
-    w206, w207,
+    w206.data(), w207.data(),
     -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
     0 /* flags */,
     &op98);
@@ -2177,7 +2178,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op0,
     1 /* batch size */, 224 /* input height */, 224 /* input width */,
-    v0 /* input */, v1 /* output */,
+    v0.data() /* input */, v1.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #0" << std::endl;
@@ -2187,7 +2188,7 @@
   status = xnn_setup_hardswish_nc_f32(
     op1,
     12544 /* batch size */,
-    v1 /* input */, v2 /* output */,
+    v1.data() /* input */, v2.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #1" << std::endl;
@@ -2197,7 +2198,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op2,
     1 /* batch size */, 112 /* input height */, 112 /* input width */,
-    v2 /* input */, v3 /* output */,
+    v2.data() /* input */, v3.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #2" << std::endl;
@@ -2207,7 +2208,7 @@
   status = xnn_setup_global_average_pooling_nwc_f32(
     op3,
     1 /* batch size */, 3136 /* width */,
-    v3 /* input */, v4 /* output */,
+    v3.data() /* input */, v4.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #3" << std::endl;
@@ -2217,7 +2218,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op4,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v4 /* input */, v5 /* output */,
+    v4.data() /* input */, v5.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #4" << std::endl;
@@ -2227,7 +2228,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op5,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v5 /* input */, v6 /* output */,
+    v5.data() /* input */, v6.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #5" << std::endl;
@@ -2240,7 +2241,7 @@
     status = xnn_setup_multiply_nd_f32(
       op6,
       4, a_shape, 4, b_shape,
-      v3 /* a */, v6 /* b */, v7 /* output */,
+      v3.data() /* a */, v6.data() /* b */, v7.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -2251,7 +2252,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op7,
     1 /* batch size */, 56 /* input height */, 56 /* input width */,
-    v7 /* input */, v8 /* output */,
+    v7.data() /* input */, v8.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #7" << std::endl;
@@ -2261,7 +2262,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op8,
     1 /* batch size */, 56 /* input height */, 56 /* input width */,
-    v8 /* input */, v9 /* output */,
+    v8.data() /* input */, v9.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #8" << std::endl;
@@ -2271,7 +2272,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op9,
     1 /* batch size */, 56 /* input height */, 56 /* input width */,
-    v9 /* input */, v10 /* output */,
+    v9.data() /* input */, v10.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #9" << std::endl;
@@ -2281,7 +2282,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op10,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v10 /* input */, v11 /* output */,
+    v10.data() /* input */, v11.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #10" << std::endl;
@@ -2291,7 +2292,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op11,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v11 /* input */, v12 /* output */,
+    v11.data() /* input */, v12.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #11" << std::endl;
@@ -2301,7 +2302,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op12,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v12 /* input */, v13 /* output */,
+    v12.data() /* input */, v13.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #12" << std::endl;
@@ -2311,7 +2312,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op13,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v13 /* input */, v14 /* output */,
+    v13.data() /* input */, v14.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #13" << std::endl;
@@ -2324,7 +2325,7 @@
     status = xnn_setup_add_nd_f32(
       op14,
       4, a_shape, 4, b_shape,
-      v14 /* a */, v11 /* b */, v15 /* output */,
+      v14.data() /* a */, v11.data() /* b */, v15.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -2335,7 +2336,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op15,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v15 /* input */, v16 /* output */,
+    v15.data() /* input */, v16.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #15" << std::endl;
@@ -2345,7 +2346,7 @@
   status = xnn_setup_hardswish_nc_f32(
     op16,
     784 /* batch size */,
-    v16 /* input */, v17 /* output */,
+    v16.data() /* input */, v17.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #16" << std::endl;
@@ -2355,7 +2356,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op17,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v17 /* input */, v18 /* output */,
+    v17.data() /* input */, v18.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #17" << std::endl;
@@ -2365,7 +2366,7 @@
   status = xnn_setup_hardswish_nc_f32(
     op18,
     196 /* batch size */,
-    v18 /* input */, v19 /* output */,
+    v18.data() /* input */, v19.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #18" << std::endl;
@@ -2375,7 +2376,7 @@
   status = xnn_setup_global_average_pooling_nwc_f32(
     op19,
     1 /* batch size */, 196 /* width */,
-    v19 /* input */, v20 /* output */,
+    v19.data() /* input */, v20.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #19" << std::endl;
@@ -2385,7 +2386,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op20,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v20 /* input */, v21 /* output */,
+    v20.data() /* input */, v21.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #20" << std::endl;
@@ -2395,7 +2396,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op21,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v21 /* input */, v22 /* output */,
+    v21.data() /* input */, v22.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #21" << std::endl;
@@ -2408,7 +2409,7 @@
     status = xnn_setup_multiply_nd_f32(
       op22,
       4, a_shape, 4, b_shape,
-      v19 /* a */, v22 /* b */, v23 /* output */,
+      v19.data() /* a */, v22.data() /* b */, v23.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -2419,7 +2420,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op23,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v23 /* input */, v24 /* output */,
+    v23.data() /* input */, v24.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #23" << std::endl;
@@ -2429,7 +2430,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op24,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v24 /* input */, v25 /* output */,
+    v24.data() /* input */, v25.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #24" << std::endl;
@@ -2439,7 +2440,7 @@
   status = xnn_setup_hardswish_nc_f32(
     op25,
     196 /* batch size */,
-    v25 /* input */, v26 /* output */,
+    v25.data() /* input */, v26.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #25" << std::endl;
@@ -2449,7 +2450,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op26,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v26 /* input */, v27 /* output */,
+    v26.data() /* input */, v27.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #26" << std::endl;
@@ -2459,7 +2460,7 @@
   status = xnn_setup_hardswish_nc_f32(
     op27,
     196 /* batch size */,
-    v27 /* input */, v28 /* output */,
+    v27.data() /* input */, v28.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #27" << std::endl;
@@ -2469,7 +2470,7 @@
   status = xnn_setup_global_average_pooling_nwc_f32(
     op28,
     1 /* batch size */, 196 /* width */,
-    v28 /* input */, v29 /* output */,
+    v28.data() /* input */, v29.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #28" << std::endl;
@@ -2479,7 +2480,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op29,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v29 /* input */, v30 /* output */,
+    v29.data() /* input */, v30.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #29" << std::endl;
@@ -2489,7 +2490,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op30,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v30 /* input */, v31 /* output */,
+    v30.data() /* input */, v31.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #30" << std::endl;
@@ -2502,7 +2503,7 @@
     status = xnn_setup_multiply_nd_f32(
       op31,
       4, a_shape, 4, b_shape,
-      v28 /* a */, v31 /* b */, v32 /* output */,
+      v28.data() /* a */, v31.data() /* b */, v32.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -2513,7 +2514,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op32,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v32 /* input */, v33 /* output */,
+    v32.data() /* input */, v33.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #32" << std::endl;
@@ -2526,7 +2527,7 @@
     status = xnn_setup_add_nd_f32(
       op33,
       4, a_shape, 4, b_shape,
-      v33 /* a */, v24 /* b */, v34 /* output */,
+      v33.data() /* a */, v24.data() /* b */, v34.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -2537,7 +2538,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op34,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v34 /* input */, v35 /* output */,
+    v34.data() /* input */, v35.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #34" << std::endl;
@@ -2547,7 +2548,7 @@
   status = xnn_setup_hardswish_nc_f32(
     op35,
     196 /* batch size */,
-    v35 /* input */, v36 /* output */,
+    v35.data() /* input */, v36.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #35" << std::endl;
@@ -2557,7 +2558,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op36,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v36 /* input */, v37 /* output */,
+    v36.data() /* input */, v37.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #36" << std::endl;
@@ -2567,7 +2568,7 @@
   status = xnn_setup_hardswish_nc_f32(
     op37,
     196 /* batch size */,
-    v37 /* input */, v38 /* output */,
+    v37.data() /* input */, v38.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #37" << std::endl;
@@ -2577,7 +2578,7 @@
   status = xnn_setup_global_average_pooling_nwc_f32(
     op38,
     1 /* batch size */, 196 /* width */,
-    v38 /* input */, v39 /* output */,
+    v38.data() /* input */, v39.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #38" << std::endl;
@@ -2587,7 +2588,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op39,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v39 /* input */, v40 /* output */,
+    v39.data() /* input */, v40.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #39" << std::endl;
@@ -2597,7 +2598,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op40,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v40 /* input */, v41 /* output */,
+    v40.data() /* input */, v41.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #40" << std::endl;
@@ -2610,7 +2611,7 @@
     status = xnn_setup_multiply_nd_f32(
       op41,
       4, a_shape, 4, b_shape,
-      v38 /* a */, v41 /* b */, v42 /* output */,
+      v38.data() /* a */, v41.data() /* b */, v42.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -2621,7 +2622,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op42,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v42 /* input */, v43 /* output */,
+    v42.data() /* input */, v43.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #42" << std::endl;
@@ -2634,7 +2635,7 @@
     status = xnn_setup_add_nd_f32(
       op43,
       4, a_shape, 4, b_shape,
-      v43 /* a */, v34 /* b */, v44 /* output */,
+      v43.data() /* a */, v34.data() /* b */, v44.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -2645,7 +2646,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op44,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v44 /* input */, v45 /* output */,
+    v44.data() /* input */, v45.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #44" << std::endl;
@@ -2655,7 +2656,7 @@
   status = xnn_setup_hardswish_nc_f32(
     op45,
     196 /* batch size */,
-    v45 /* input */, v46 /* output */,
+    v45.data() /* input */, v46.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #45" << std::endl;
@@ -2665,7 +2666,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op46,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v46 /* input */, v47 /* output */,
+    v46.data() /* input */, v47.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #46" << std::endl;
@@ -2675,7 +2676,7 @@
   status = xnn_setup_hardswish_nc_f32(
     op47,
     196 /* batch size */,
-    v47 /* input */, v48 /* output */,
+    v47.data() /* input */, v48.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #47" << std::endl;
@@ -2685,7 +2686,7 @@
   status = xnn_setup_global_average_pooling_nwc_f32(
     op48,
     1 /* batch size */, 196 /* width */,
-    v48 /* input */, v49 /* output */,
+    v48.data() /* input */, v49.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #48" << std::endl;
@@ -2695,7 +2696,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op49,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v49 /* input */, v50 /* output */,
+    v49.data() /* input */, v50.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #49" << std::endl;
@@ -2705,7 +2706,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op50,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v50 /* input */, v51 /* output */,
+    v50.data() /* input */, v51.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #50" << std::endl;
@@ -2718,7 +2719,7 @@
     status = xnn_setup_multiply_nd_f32(
       op51,
       4, a_shape, 4, b_shape,
-      v48 /* a */, v51 /* b */, v52 /* output */,
+      v48.data() /* a */, v51.data() /* b */, v52.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -2729,7 +2730,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op52,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v52 /* input */, v53 /* output */,
+    v52.data() /* input */, v53.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #52" << std::endl;
@@ -2739,7 +2740,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op53,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v53 /* input */, v54 /* output */,
+    v53.data() /* input */, v54.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #53" << std::endl;
@@ -2749,7 +2750,7 @@
   status = xnn_setup_hardswish_nc_f32(
     op54,
     196 /* batch size */,
-    v54 /* input */, v55 /* output */,
+    v54.data() /* input */, v55.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #54" << std::endl;
@@ -2759,7 +2760,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op55,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v55 /* input */, v56 /* output */,
+    v55.data() /* input */, v56.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #55" << std::endl;
@@ -2769,7 +2770,7 @@
   status = xnn_setup_hardswish_nc_f32(
     op56,
     196 /* batch size */,
-    v56 /* input */, v57 /* output */,
+    v56.data() /* input */, v57.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #56" << std::endl;
@@ -2779,7 +2780,7 @@
   status = xnn_setup_global_average_pooling_nwc_f32(
     op57,
     1 /* batch size */, 196 /* width */,
-    v57 /* input */, v58 /* output */,
+    v57.data() /* input */, v58.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #57" << std::endl;
@@ -2789,7 +2790,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op58,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v58 /* input */, v59 /* output */,
+    v58.data() /* input */, v59.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #58" << std::endl;
@@ -2799,7 +2800,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op59,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v59 /* input */, v60 /* output */,
+    v59.data() /* input */, v60.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #59" << std::endl;
@@ -2812,7 +2813,7 @@
     status = xnn_setup_multiply_nd_f32(
       op60,
       4, a_shape, 4, b_shape,
-      v57 /* a */, v60 /* b */, v61 /* output */,
+      v57.data() /* a */, v60.data() /* b */, v61.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -2823,7 +2824,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op61,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v61 /* input */, v62 /* output */,
+    v61.data() /* input */, v62.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #61" << std::endl;
@@ -2836,7 +2837,7 @@
     status = xnn_setup_add_nd_f32(
       op62,
       4, a_shape, 4, b_shape,
-      v62 /* a */, v53 /* b */, v63 /* output */,
+      v62.data() /* a */, v53.data() /* b */, v63.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -2847,7 +2848,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op63,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v63 /* input */, v64 /* output */,
+    v63.data() /* input */, v64.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #63" << std::endl;
@@ -2857,7 +2858,7 @@
   status = xnn_setup_hardswish_nc_f32(
     op64,
     196 /* batch size */,
-    v64 /* input */, v65 /* output */,
+    v64.data() /* input */, v65.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #64" << std::endl;
@@ -2867,7 +2868,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op65,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v65 /* input */, v66 /* output */,
+    v65.data() /* input */, v66.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #65" << std::endl;
@@ -2877,7 +2878,7 @@
   status = xnn_setup_hardswish_nc_f32(
     op66,
     49 /* batch size */,
-    v66 /* input */, v67 /* output */,
+    v66.data() /* input */, v67.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #66" << std::endl;
@@ -2887,7 +2888,7 @@
   status = xnn_setup_global_average_pooling_nwc_f32(
     op67,
     1 /* batch size */, 49 /* width */,
-    v67 /* input */, v68 /* output */,
+    v67.data() /* input */, v68.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #67" << std::endl;
@@ -2897,7 +2898,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op68,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v68 /* input */, v69 /* output */,
+    v68.data() /* input */, v69.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #68" << std::endl;
@@ -2907,7 +2908,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op69,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v69 /* input */, v70 /* output */,
+    v69.data() /* input */, v70.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #69" << std::endl;
@@ -2920,7 +2921,7 @@
     status = xnn_setup_multiply_nd_f32(
       op70,
       4, a_shape, 4, b_shape,
-      v67 /* a */, v70 /* b */, v71 /* output */,
+      v67.data() /* a */, v70.data() /* b */, v71.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -2931,7 +2932,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op71,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v71 /* input */, v72 /* output */,
+    v71.data() /* input */, v72.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #71" << std::endl;
@@ -2941,7 +2942,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op72,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v72 /* input */, v73 /* output */,
+    v72.data() /* input */, v73.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #72" << std::endl;
@@ -2951,7 +2952,7 @@
   status = xnn_setup_hardswish_nc_f32(
     op73,
     49 /* batch size */,
-    v73 /* input */, v74 /* output */,
+    v73.data() /* input */, v74.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #73" << std::endl;
@@ -2961,7 +2962,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op74,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v74 /* input */, v75 /* output */,
+    v74.data() /* input */, v75.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #74" << std::endl;
@@ -2971,7 +2972,7 @@
   status = xnn_setup_hardswish_nc_f32(
     op75,
     49 /* batch size */,
-    v75 /* input */, v76 /* output */,
+    v75.data() /* input */, v76.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #75" << std::endl;
@@ -2981,7 +2982,7 @@
   status = xnn_setup_global_average_pooling_nwc_f32(
     op76,
     1 /* batch size */, 49 /* width */,
-    v76 /* input */, v77 /* output */,
+    v76.data() /* input */, v77.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #76" << std::endl;
@@ -2991,7 +2992,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op77,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v77 /* input */, v78 /* output */,
+    v77.data() /* input */, v78.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #77" << std::endl;
@@ -3001,7 +3002,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op78,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v78 /* input */, v79 /* output */,
+    v78.data() /* input */, v79.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #78" << std::endl;
@@ -3014,7 +3015,7 @@
     status = xnn_setup_multiply_nd_f32(
       op79,
       4, a_shape, 4, b_shape,
-      v76 /* a */, v79 /* b */, v80 /* output */,
+      v76.data() /* a */, v79.data() /* b */, v80.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -3025,7 +3026,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op80,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v80 /* input */, v81 /* output */,
+    v80.data() /* input */, v81.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #80" << std::endl;
@@ -3038,7 +3039,7 @@
     status = xnn_setup_add_nd_f32(
       op81,
       4, a_shape, 4, b_shape,
-      v81 /* a */, v72 /* b */, v82 /* output */,
+      v81.data() /* a */, v72.data() /* b */, v82.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -3049,7 +3050,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op82,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v82 /* input */, v83 /* output */,
+    v82.data() /* input */, v83.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #82" << std::endl;
@@ -3059,7 +3060,7 @@
   status = xnn_setup_hardswish_nc_f32(
     op83,
     49 /* batch size */,
-    v83 /* input */, v84 /* output */,
+    v83.data() /* input */, v84.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #83" << std::endl;
@@ -3069,7 +3070,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op84,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v84 /* input */, v85 /* output */,
+    v84.data() /* input */, v85.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #84" << std::endl;
@@ -3079,7 +3080,7 @@
   status = xnn_setup_hardswish_nc_f32(
     op85,
     49 /* batch size */,
-    v85 /* input */, v86 /* output */,
+    v85.data() /* input */, v86.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #85" << std::endl;
@@ -3089,7 +3090,7 @@
   status = xnn_setup_global_average_pooling_nwc_f32(
     op86,
     1 /* batch size */, 49 /* width */,
-    v86 /* input */, v87 /* output */,
+    v86.data() /* input */, v87.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #86" << std::endl;
@@ -3099,7 +3100,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op87,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v87 /* input */, v88 /* output */,
+    v87.data() /* input */, v88.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #87" << std::endl;
@@ -3109,7 +3110,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op88,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v88 /* input */, v89 /* output */,
+    v88.data() /* input */, v89.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #88" << std::endl;
@@ -3122,7 +3123,7 @@
     status = xnn_setup_multiply_nd_f32(
       op89,
       4, a_shape, 4, b_shape,
-      v86 /* a */, v89 /* b */, v90 /* output */,
+      v86.data() /* a */, v89.data() /* b */, v90.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -3133,7 +3134,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op90,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v90 /* input */, v91 /* output */,
+    v90.data() /* input */, v91.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #90" << std::endl;
@@ -3146,7 +3147,7 @@
     status = xnn_setup_add_nd_f32(
       op91,
       4, a_shape, 4, b_shape,
-      v91 /* a */, v82 /* b */, v92 /* output */,
+      v91.data() /* a */, v82.data() /* b */, v92.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -3157,7 +3158,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op92,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v92 /* input */, v93 /* output */,
+    v92.data() /* input */, v93.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #92" << std::endl;
@@ -3167,7 +3168,7 @@
   status = xnn_setup_hardswish_nc_f32(
     op93,
     49 /* batch size */,
-    v93 /* input */, v94 /* output */,
+    v93.data() /* input */, v94.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #93" << std::endl;
@@ -3177,7 +3178,7 @@
   status = xnn_setup_global_average_pooling_nwc_f32(
     op94,
     1 /* batch size */, 49 /* width */,
-    v94 /* input */, v95 /* output */,
+    v94.data() /* input */, v95.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #94" << std::endl;
@@ -3187,7 +3188,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op95,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v95 /* input */, v96 /* output */,
+    v95.data() /* input */, v96.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #95" << std::endl;
@@ -3197,7 +3198,7 @@
   status = xnn_setup_hardswish_nc_f32(
     op96,
     1 /* batch size */,
-    v96 /* input */, v97 /* output */,
+    v96.data() /* input */, v97.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #96" << std::endl;
@@ -3207,7 +3208,7 @@
   status = xnn_setup_global_average_pooling_nwc_f32(
     op97,
     1 /* batch size */, 1 /* width */,
-    v97 /* input */, v98 /* output */,
+    v97.data() /* input */, v98.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #97" << std::endl;
@@ -3217,7 +3218,7 @@
   status = xnn_setup_convolution2d_nhwc_f32(
     op98,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v98 /* input */, v99 /* output */,
+    v98.data() /* input */, v99.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #98" << std::endl;
diff --git a/models/fp32-sparse-mobilenet-v1.cc b/models/fp32-sparse-mobilenet-v1.cc
new file mode 100644
index 0000000..aeb285f
--- /dev/null
+++ b/models/fp32-sparse-mobilenet-v1.cc
@@ -0,0 +1,1152 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <xnnpack.h>
+
+#include <array>
+#include <algorithm>
+#include <functional>
+#include <iostream>
+#include <limits>
+#include <random>
+
+#include "models/models.h"
+
+namespace models {
+
+ExecutionPlan FP32SparseMobileNetV1(float sparsity, pthreadpool_t threadpool) {
+  alignas(16) static std::array<float, 150528> v0;
+  alignas(16) static std::array<float, 401408> v1;
+  alignas(16) static std::array<float, 401408> v2;
+  alignas(16) static std::array<float, 802816> v3;
+  alignas(16) static std::array<float, 200704> v4;
+  alignas(16) static std::array<float, 401408> v5;
+  alignas(16) static std::array<float, 401408> v6;
+  alignas(16) static std::array<float, 401408> v7;
+  alignas(16) static std::array<float, 100352> v8;
+  alignas(16) static std::array<float, 200704> v9;
+  alignas(16) static std::array<float, 200704> v10;
+  alignas(16) static std::array<float, 200704> v11;
+  alignas(16) static std::array<float, 50176> v12;
+  alignas(16) static std::array<float, 100352> v13;
+  alignas(16) static std::array<float, 100352> v14;
+  alignas(16) static std::array<float, 100352> v15;
+  alignas(16) static std::array<float, 100352> v16;
+  alignas(16) static std::array<float, 100352> v17;
+  alignas(16) static std::array<float, 100352> v18;
+  alignas(16) static std::array<float, 100352> v19;
+  alignas(16) static std::array<float, 100352> v20;
+  alignas(16) static std::array<float, 100352> v21;
+  alignas(16) static std::array<float, 100352> v22;
+  alignas(16) static std::array<float, 100352> v23;
+  alignas(16) static std::array<float, 25088> v24;
+  alignas(16) static std::array<float, 50176> v25;
+  alignas(16) static std::array<float, 50176> v26;
+  alignas(16) static std::array<float, 50176> v27;
+  alignas(16) static std::array<float, 1024> v28;
+  alignas(16) static std::array<float, 1001> v29;
+  alignas(16) static std::array<float, 864> w30;
+  alignas(16) static std::array<float, 32> w31;
+  alignas(16) static std::array<float, 288> w32;
+  alignas(16) static std::array<float, 32> w33;
+  alignas(16) static std::array<float, 2048> w34;
+  alignas(16) static std::array<float, 64> w35;
+  alignas(16) static std::array<float, 576> w36;
+  alignas(16) static std::array<float, 64> w37;
+  alignas(16) static std::array<float, 8192> w38;
+  alignas(16) static std::array<float, 128> w39;
+  alignas(16) static std::array<float, 1152> w40;
+  alignas(16) static std::array<float, 128> w41;
+  alignas(16) static std::array<float, 16384> w42;
+  alignas(16) static std::array<float, 128> w43;
+  alignas(16) static std::array<float, 1152> w44;
+  alignas(16) static std::array<float, 128> w45;
+  alignas(16) static std::array<float, 32768> w46;
+  alignas(16) static std::array<float, 256> w47;
+  alignas(16) static std::array<float, 2304> w48;
+  alignas(16) static std::array<float, 256> w49;
+  alignas(16) static std::array<float, 65536> w50;
+  alignas(16) static std::array<float, 256> w51;
+  alignas(16) static std::array<float, 2304> w52;
+  alignas(16) static std::array<float, 256> w53;
+  alignas(16) static std::array<float, 131072> w54;
+  alignas(16) static std::array<float, 512> w55;
+  alignas(16) static std::array<float, 4608> w56;
+  alignas(16) static std::array<float, 512> w57;
+  alignas(16) static std::array<float, 262144> w58;
+  alignas(16) static std::array<float, 512> w59;
+  alignas(16) static std::array<float, 4608> w60;
+  alignas(16) static std::array<float, 512> w61;
+  alignas(16) static std::array<float, 262144> w62;
+  alignas(16) static std::array<float, 512> w63;
+  alignas(16) static std::array<float, 4608> w64;
+  alignas(16) static std::array<float, 512> w65;
+  alignas(16) static std::array<float, 262144> w66;
+  alignas(16) static std::array<float, 512> w67;
+  alignas(16) static std::array<float, 4608> w68;
+  alignas(16) static std::array<float, 512> w69;
+  alignas(16) static std::array<float, 262144> w70;
+  alignas(16) static std::array<float, 512> w71;
+  alignas(16) static std::array<float, 4608> w72;
+  alignas(16) static std::array<float, 512> w73;
+  alignas(16) static std::array<float, 262144> w74;
+  alignas(16) static std::array<float, 512> w75;
+  alignas(16) static std::array<float, 4608> w76;
+  alignas(16) static std::array<float, 512> w77;
+  alignas(16) static std::array<float, 524288> w78;
+  alignas(16) static std::array<float, 1024> w79;
+  alignas(16) static std::array<float, 9216> w80;
+  alignas(16) static std::array<float, 1024> w81;
+  alignas(16) static std::array<float, 1048576> w82;
+  alignas(16) static std::array<float, 1024> w83;
+  alignas(16) static std::array<float, 1025024> w84;
+  alignas(16) static std::array<float, 1001> w85;
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.0f, +1.0f), std::ref(rng));
+  std::generate(v0.begin(), v0.end(), std::ref(f32rng));
+  std::generate(v1.begin(), v1.end(), std::ref(f32rng));
+  std::generate(v2.begin(), v2.end(), std::ref(f32rng));
+  std::generate(v3.begin(), v3.end(), std::ref(f32rng));
+  std::generate(v4.begin(), v4.end(), std::ref(f32rng));
+  std::generate(v5.begin(), v5.end(), std::ref(f32rng));
+  std::generate(v6.begin(), v6.end(), std::ref(f32rng));
+  std::generate(v7.begin(), v7.end(), std::ref(f32rng));
+  std::generate(v8.begin(), v8.end(), std::ref(f32rng));
+  std::generate(v9.begin(), v9.end(), std::ref(f32rng));
+  std::generate(v10.begin(), v10.end(), std::ref(f32rng));
+  std::generate(v11.begin(), v11.end(), std::ref(f32rng));
+  std::generate(v12.begin(), v12.end(), std::ref(f32rng));
+  std::generate(v13.begin(), v13.end(), std::ref(f32rng));
+  std::generate(v14.begin(), v14.end(), std::ref(f32rng));
+  std::generate(v15.begin(), v15.end(), std::ref(f32rng));
+  std::generate(v16.begin(), v16.end(), std::ref(f32rng));
+  std::generate(v17.begin(), v17.end(), std::ref(f32rng));
+  std::generate(v18.begin(), v18.end(), std::ref(f32rng));
+  std::generate(v19.begin(), v19.end(), std::ref(f32rng));
+  std::generate(v20.begin(), v20.end(), std::ref(f32rng));
+  std::generate(v21.begin(), v21.end(), std::ref(f32rng));
+  std::generate(v22.begin(), v22.end(), std::ref(f32rng));
+  std::generate(v23.begin(), v23.end(), std::ref(f32rng));
+  std::generate(v24.begin(), v24.end(), std::ref(f32rng));
+  std::generate(v25.begin(), v25.end(), std::ref(f32rng));
+  std::generate(v26.begin(), v26.end(), std::ref(f32rng));
+  std::generate(v27.begin(), v27.end(), std::ref(f32rng));
+  std::generate(v28.begin(), v28.end(), std::ref(f32rng));
+  std::generate(v29.begin(), v29.end(), std::ref(f32rng));
+  std::generate(w30.begin(), w30.end(), std::ref(f32rng));
+  std::generate(w31.begin(), w31.end(), std::ref(f32rng));
+  std::generate(w32.begin(), w32.end(), std::ref(f32rng));
+  std::generate(w33.begin(), w33.end(), std::ref(f32rng));
+  std::fill(w34.begin(), w34.end(), 0.0f);
+  std::generate(w34.begin(), w34.end() - size_t(sparsity * w34.size()), std::ref(f32rng));
+  std::shuffle(w34.begin(), w34.end(), rng);
+  std::generate(w35.begin(), w35.end(), std::ref(f32rng));
+  std::generate(w36.begin(), w36.end(), std::ref(f32rng));
+  std::generate(w37.begin(), w37.end(), std::ref(f32rng));
+  std::fill(w38.begin(), w38.end(), 0.0f);
+  std::generate(w38.begin(), w38.end() - size_t(sparsity * w38.size()), std::ref(f32rng));
+  std::shuffle(w38.begin(), w38.end(), rng);
+  std::generate(w39.begin(), w39.end(), std::ref(f32rng));
+  std::generate(w40.begin(), w40.end(), std::ref(f32rng));
+  std::generate(w41.begin(), w41.end(), std::ref(f32rng));
+  std::fill(w42.begin(), w42.end(), 0.0f);
+  std::generate(w42.begin(), w42.end() - size_t(sparsity * w42.size()), std::ref(f32rng));
+  std::shuffle(w42.begin(), w42.end(), rng);
+  std::generate(w43.begin(), w43.end(), std::ref(f32rng));
+  std::generate(w44.begin(), w44.end(), std::ref(f32rng));
+  std::generate(w45.begin(), w45.end(), std::ref(f32rng));
+  std::fill(w46.begin(), w46.end(), 0.0f);
+  std::generate(w46.begin(), w46.end() - size_t(sparsity * w46.size()), std::ref(f32rng));
+  std::shuffle(w46.begin(), w46.end(), rng);
+  std::generate(w47.begin(), w47.end(), std::ref(f32rng));
+  std::generate(w48.begin(), w48.end(), std::ref(f32rng));
+  std::generate(w49.begin(), w49.end(), std::ref(f32rng));
+  std::fill(w50.begin(), w50.end(), 0.0f);
+  std::generate(w50.begin(), w50.end() - size_t(sparsity * w50.size()), std::ref(f32rng));
+  std::shuffle(w50.begin(), w50.end(), rng);
+  std::generate(w51.begin(), w51.end(), std::ref(f32rng));
+  std::generate(w52.begin(), w52.end(), std::ref(f32rng));
+  std::generate(w53.begin(), w53.end(), std::ref(f32rng));
+  std::fill(w54.begin(), w54.end(), 0.0f);
+  std::generate(w54.begin(), w54.end() - size_t(sparsity * w54.size()), std::ref(f32rng));
+  std::shuffle(w54.begin(), w54.end(), rng);
+  std::generate(w55.begin(), w55.end(), std::ref(f32rng));
+  std::generate(w56.begin(), w56.end(), std::ref(f32rng));
+  std::generate(w57.begin(), w57.end(), std::ref(f32rng));
+  std::fill(w58.begin(), w58.end(), 0.0f);
+  std::generate(w58.begin(), w58.end() - size_t(sparsity * w58.size()), std::ref(f32rng));
+  std::shuffle(w58.begin(), w58.end(), rng);
+  std::generate(w59.begin(), w59.end(), std::ref(f32rng));
+  std::generate(w60.begin(), w60.end(), std::ref(f32rng));
+  std::generate(w61.begin(), w61.end(), std::ref(f32rng));
+  std::fill(w62.begin(), w62.end(), 0.0f);
+  std::generate(w62.begin(), w62.end() - size_t(sparsity * w62.size()), std::ref(f32rng));
+  std::shuffle(w62.begin(), w62.end(), rng);
+  std::generate(w63.begin(), w63.end(), std::ref(f32rng));
+  std::generate(w64.begin(), w64.end(), std::ref(f32rng));
+  std::generate(w65.begin(), w65.end(), std::ref(f32rng));
+  std::fill(w66.begin(), w66.end(), 0.0f);
+  std::generate(w66.begin(), w66.end() - size_t(sparsity * w66.size()), std::ref(f32rng));
+  std::shuffle(w66.begin(), w66.end(), rng);
+  std::generate(w67.begin(), w67.end(), std::ref(f32rng));
+  std::generate(w68.begin(), w68.end(), std::ref(f32rng));
+  std::generate(w69.begin(), w69.end(), std::ref(f32rng));
+  std::fill(w70.begin(), w70.end(), 0.0f);
+  std::generate(w70.begin(), w70.end() - size_t(sparsity * w70.size()), std::ref(f32rng));
+  std::shuffle(w70.begin(), w70.end(), rng);
+  std::generate(w71.begin(), w71.end(), std::ref(f32rng));
+  std::generate(w72.begin(), w72.end(), std::ref(f32rng));
+  std::generate(w73.begin(), w73.end(), std::ref(f32rng));
+  std::fill(w74.begin(), w74.end(), 0.0f);
+  std::generate(w74.begin(), w74.end() - size_t(sparsity * w74.size()), std::ref(f32rng));
+  std::shuffle(w74.begin(), w74.end(), rng);
+  std::generate(w75.begin(), w75.end(), std::ref(f32rng));
+  std::generate(w76.begin(), w76.end(), std::ref(f32rng));
+  std::generate(w77.begin(), w77.end(), std::ref(f32rng));
+  std::fill(w78.begin(), w78.end(), 0.0f);
+  std::generate(w78.begin(), w78.end() - size_t(sparsity * w78.size()), std::ref(f32rng));
+  std::shuffle(w78.begin(), w78.end(), rng);
+  std::generate(w79.begin(), w79.end(), std::ref(f32rng));
+  std::generate(w80.begin(), w80.end(), std::ref(f32rng));
+  std::generate(w81.begin(), w81.end(), std::ref(f32rng));
+  std::fill(w82.begin(), w82.end(), 0.0f);
+  std::generate(w82.begin(), w82.end() - size_t(sparsity * w82.size()), std::ref(f32rng));
+  std::shuffle(w82.begin(), w82.end(), rng);
+  std::generate(w83.begin(), w83.end(), std::ref(f32rng));
+  std::generate(w84.begin(), w84.end(), std::ref(f32rng));
+  std::generate(w85.begin(), w85.end(), std::ref(f32rng));
+
+  ExecutionPlan operators;
+  xnn_status status;
+
+  xnn_operator_t op0 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    1 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 1 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    2 /* subsampling height */, 2 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    3 /* input channels per group */,
+    32 /* output_channels_per_group */,
+    3 /* input pixel stride */,
+    32 /* output pixel stride */,
+    w30.data(), w31.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    XNN_FLAG_INPUT_NHWC /* flags */,
+    &op0);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #0" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op0, xnn_delete_operator);
+
+  xnn_operator_t op1 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    1 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 1 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    32 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    32 /* input pixel stride */,
+    32 /* output pixel stride */,
+    w32.data(), w33.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op1);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #1" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op1, xnn_delete_operator);
+
+  xnn_operator_t op2 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    32 /* input channels per group */,
+    64 /* output_channels_per_group */,
+    32 /* input pixel stride */,
+    64 /* output pixel stride */,
+    w34.data(), w35.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op2);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #2" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op2, xnn_delete_operator);
+
+  xnn_operator_t op3 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    1 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 1 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    2 /* subsampling height */, 2 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    64 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    64 /* input pixel stride */,
+    64 /* output pixel stride */,
+    w36.data(), w37.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op3);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #3" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op3, xnn_delete_operator);
+
+  xnn_operator_t op4 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    64 /* input channels per group */,
+    128 /* output_channels_per_group */,
+    64 /* input pixel stride */,
+    128 /* output pixel stride */,
+    w38.data(), w39.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op4);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #4" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op4, xnn_delete_operator);
+
+  xnn_operator_t op5 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    1 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 1 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    128 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    128 /* input pixel stride */,
+    128 /* output pixel stride */,
+    w40.data(), w41.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op5);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #5" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op5, xnn_delete_operator);
+
+  xnn_operator_t op6 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    128 /* input channels per group */,
+    128 /* output_channels_per_group */,
+    128 /* input pixel stride */,
+    128 /* output pixel stride */,
+    w42.data(), w43.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op6);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #6" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op6, xnn_delete_operator);
+
+  xnn_operator_t op7 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    1 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 1 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    2 /* subsampling height */, 2 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    128 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    128 /* input pixel stride */,
+    128 /* output pixel stride */,
+    w44.data(), w45.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op7);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #7" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op7, xnn_delete_operator);
+
+  xnn_operator_t op8 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    128 /* input channels per group */,
+    256 /* output_channels_per_group */,
+    128 /* input pixel stride */,
+    256 /* output pixel stride */,
+    w46.data(), w47.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op8);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #8" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op8, xnn_delete_operator);
+
+  xnn_operator_t op9 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    1 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 1 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    256 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    256 /* input pixel stride */,
+    256 /* output pixel stride */,
+    w48.data(), w49.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op9);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #9" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op9, xnn_delete_operator);
+
+  xnn_operator_t op10 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    256 /* input channels per group */,
+    256 /* output_channels_per_group */,
+    256 /* input pixel stride */,
+    256 /* output pixel stride */,
+    w50.data(), w51.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op10);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #10" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op10, xnn_delete_operator);
+
+  xnn_operator_t op11 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    1 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 1 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    2 /* subsampling height */, 2 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    256 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    256 /* input pixel stride */,
+    256 /* output pixel stride */,
+    w52.data(), w53.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op11);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #11" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op11, xnn_delete_operator);
+
+  xnn_operator_t op12 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    256 /* input channels per group */,
+    512 /* output_channels_per_group */,
+    256 /* input pixel stride */,
+    512 /* output pixel stride */,
+    w54.data(), w55.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op12);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #12" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op12, xnn_delete_operator);
+
+  xnn_operator_t op13 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    1 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 1 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    512 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    512 /* input pixel stride */,
+    512 /* output pixel stride */,
+    w56.data(), w57.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op13);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #13" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op13, xnn_delete_operator);
+
+  xnn_operator_t op14 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    512 /* input channels per group */,
+    512 /* output_channels_per_group */,
+    512 /* input pixel stride */,
+    512 /* output pixel stride */,
+    w58.data(), w59.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op14);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #14" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op14, xnn_delete_operator);
+
+  xnn_operator_t op15 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    1 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 1 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    512 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    512 /* input pixel stride */,
+    512 /* output pixel stride */,
+    w60.data(), w61.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op15);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #15" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op15, xnn_delete_operator);
+
+  xnn_operator_t op16 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    512 /* input channels per group */,
+    512 /* output_channels_per_group */,
+    512 /* input pixel stride */,
+    512 /* output pixel stride */,
+    w62.data(), w63.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op16);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #16" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op16, xnn_delete_operator);
+
+  xnn_operator_t op17 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    1 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 1 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    512 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    512 /* input pixel stride */,
+    512 /* output pixel stride */,
+    w64.data(), w65.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op17);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #17" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op17, xnn_delete_operator);
+
+  xnn_operator_t op18 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    512 /* input channels per group */,
+    512 /* output_channels_per_group */,
+    512 /* input pixel stride */,
+    512 /* output pixel stride */,
+    w66.data(), w67.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op18);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #18" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op18, xnn_delete_operator);
+
+  xnn_operator_t op19 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    1 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 1 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    512 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    512 /* input pixel stride */,
+    512 /* output pixel stride */,
+    w68.data(), w69.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op19);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #19" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op19, xnn_delete_operator);
+
+  xnn_operator_t op20 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    512 /* input channels per group */,
+    512 /* output_channels_per_group */,
+    512 /* input pixel stride */,
+    512 /* output pixel stride */,
+    w70.data(), w71.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op20);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #20" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op20, xnn_delete_operator);
+
+  xnn_operator_t op21 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    1 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 1 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    512 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    512 /* input pixel stride */,
+    512 /* output pixel stride */,
+    w72.data(), w73.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op21);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #21" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op21, xnn_delete_operator);
+
+  xnn_operator_t op22 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    512 /* input channels per group */,
+    512 /* output_channels_per_group */,
+    512 /* input pixel stride */,
+    512 /* output pixel stride */,
+    w74.data(), w75.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op22);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #22" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op22, xnn_delete_operator);
+
+  xnn_operator_t op23 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    1 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 1 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    2 /* subsampling height */, 2 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    512 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    512 /* input pixel stride */,
+    512 /* output pixel stride */,
+    w76.data(), w77.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op23);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #23" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op23, xnn_delete_operator);
+
+  xnn_operator_t op24 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    512 /* input channels per group */,
+    1024 /* output_channels_per_group */,
+    512 /* input pixel stride */,
+    1024 /* output pixel stride */,
+    w78.data(), w79.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op24);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #24" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op24, xnn_delete_operator);
+
+  xnn_operator_t op25 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    1 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 1 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1024 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    1024 /* input pixel stride */,
+    1024 /* output pixel stride */,
+    w80.data(), w81.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op25);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #25" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op25, xnn_delete_operator);
+
+  xnn_operator_t op26 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    1024 /* input channels per group */,
+    1024 /* output_channels_per_group */,
+    1024 /* input pixel stride */,
+    1024 /* output pixel stride */,
+    w82.data(), w83.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op26);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #26" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op26, xnn_delete_operator);
+
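+  // Classifier head: op27 averages the 1024-channel CHW feature map over its 49
+  // spatial elements (7x7), and op28 applies a dense 1x1 NHWC convolution mapping
+  // 1024 channels to the 1001 output classes.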
+  xnn_operator_t op27 = nullptr;
+  status = xnn_create_global_average_pooling_ncw_f32(
+    1024 /* channels */,
+    -std::numeric_limits<float>::infinity(), std::numeric_limits<float>::infinity(),
+    0 /* flags */,
+    &op27);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #27" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op27, xnn_delete_operator);
+
+  xnn_operator_t op28 = nullptr;
+  status = xnn_create_convolution2d_nhwc_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    1024 /* input channels per group */,
+    1001 /* output_channels_per_group */,
+    1024 /* input pixel stride */,
+    1001 /* output pixel stride */,
+    w84.data(), w85.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op28);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #28" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op28, xnn_delete_operator);
+
+
+
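+  // The setup calls below bind each operator to its activation buffers and spatial
+  // dimensions; the feature map halves after every stride-2 convolution
+  // (224 -> 112 -> 56 -> 28 -> 14 -> 7), which the per-operator input sizes reflect.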
+  status = xnn_setup_convolution2d_nchw_f32(
+    op0,
+    1 /* batch size */, 224 /* input height */, 224 /* input width */,
+    v0.data() /* input */, v1.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #0" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op1,
+    1 /* batch size */, 112 /* input height */, 112 /* input width */,
+    v1.data() /* input */, v2.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #1" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op2,
+    1 /* batch size */, 112 /* input height */, 112 /* input width */,
+    v2.data() /* input */, v3.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #2" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op3,
+    1 /* batch size */, 112 /* input height */, 112 /* input width */,
+    v3.data() /* input */, v4.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #3" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op4,
+    1 /* batch size */, 56 /* input height */, 56 /* input width */,
+    v4.data() /* input */, v5.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #4" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op5,
+    1 /* batch size */, 56 /* input height */, 56 /* input width */,
+    v5.data() /* input */, v6.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #5" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op6,
+    1 /* batch size */, 56 /* input height */, 56 /* input width */,
+    v6.data() /* input */, v7.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #6" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op7,
+    1 /* batch size */, 56 /* input height */, 56 /* input width */,
+    v7.data() /* input */, v8.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #7" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op8,
+    1 /* batch size */, 28 /* input height */, 28 /* input width */,
+    v8.data() /* input */, v9.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #8" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op9,
+    1 /* batch size */, 28 /* input height */, 28 /* input width */,
+    v9.data() /* input */, v10.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #9" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op10,
+    1 /* batch size */, 28 /* input height */, 28 /* input width */,
+    v10.data() /* input */, v11.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #10" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op11,
+    1 /* batch size */, 28 /* input height */, 28 /* input width */,
+    v11.data() /* input */, v12.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #11" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op12,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v12.data() /* input */, v13.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #12" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op13,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v13.data() /* input */, v14.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #13" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op14,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v14.data() /* input */, v15.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #14" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op15,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v15.data() /* input */, v16.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #15" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op16,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v16.data() /* input */, v17.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #16" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op17,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v17.data() /* input */, v18.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #17" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op18,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v18.data() /* input */, v19.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #18" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op19,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v19.data() /* input */, v20.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #19" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op20,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v20.data() /* input */, v21.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #20" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op21,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v21.data() /* input */, v22.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #21" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op22,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v22.data() /* input */, v23.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #22" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op23,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v23.data() /* input */, v24.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #23" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op24,
+    1 /* batch size */, 7 /* input height */, 7 /* input width */,
+    v24.data() /* input */, v25.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #24" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op25,
+    1 /* batch size */, 7 /* input height */, 7 /* input width */,
+    v25.data() /* input */, v26.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #25" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op26,
+    1 /* batch size */, 7 /* input height */, 7 /* input width */,
+    v26.data() /* input */, v27.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #26" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_global_average_pooling_ncw_f32(
+    op27,
+    1 /* batch size */, 49 /* width */,
+    v27.data() /* input */, v28.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #27" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nhwc_f32(
+    op28,
+    1 /* batch size */, 1 /* input height */, 1 /* input width */,
+    v28.data() /* input */, v29.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #28" << std::endl;
+    return ExecutionPlan();
+  }
+
+  #pragma clang diagnostic push
+  #pragma clang diagnostic ignored "-Wpessimizing-move"
+  return operators;
+  #pragma clang diagnostic pop
+}
+
+}  // namespace models
diff --git a/models/fp32-sparse-mobilenet-v2.cc b/models/fp32-sparse-mobilenet-v2.cc
new file mode 100644
index 0000000..10b6aba
--- /dev/null
+++ b/models/fp32-sparse-mobilenet-v2.cc
@@ -0,0 +1,2416 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <xnnpack.h>
+
+#include <array>
+#include <algorithm>
+#include <functional>
+#include <iostream>
+#include <limits>
+#include <random>
+
+#include "models/models.h"
+
+namespace models {
+
+ExecutionPlan FP32SparseMobileNetV2(float sparsity, pthreadpool_t threadpool) {
+  alignas(16) static std::array<float, 150528> v0;
+  alignas(16) static std::array<float, 401408> v1;
+  alignas(16) static std::array<float, 401408> v2;
+  alignas(16) static std::array<float, 200704> v3;
+  alignas(16) static std::array<float, 1204224> v4;
+  alignas(16) static std::array<float, 301056> v5;
+  alignas(16) static std::array<float, 75264> v6;
+  alignas(16) static std::array<float, 451584> v7;
+  alignas(16) static std::array<float, 451584> v8;
+  alignas(16) static std::array<float, 75264> v9;
+  alignas(16) static std::array<float, 75264> v10;
+  alignas(16) static std::array<float, 451584> v11;
+  alignas(16) static std::array<float, 112896> v12;
+  alignas(16) static std::array<float, 25088> v13;
+  alignas(16) static std::array<float, 150528> v14;
+  alignas(16) static std::array<float, 150528> v15;
+  alignas(16) static std::array<float, 25088> v16;
+  alignas(16) static std::array<float, 25088> v17;
+  alignas(16) static std::array<float, 150528> v18;
+  alignas(16) static std::array<float, 150528> v19;
+  alignas(16) static std::array<float, 25088> v20;
+  alignas(16) static std::array<float, 25088> v21;
+  alignas(16) static std::array<float, 150528> v22;
+  alignas(16) static std::array<float, 37632> v23;
+  alignas(16) static std::array<float, 12544> v24;
+  alignas(16) static std::array<float, 75264> v25;
+  alignas(16) static std::array<float, 75264> v26;
+  alignas(16) static std::array<float, 12544> v27;
+  alignas(16) static std::array<float, 12544> v28;
+  alignas(16) static std::array<float, 75264> v29;
+  alignas(16) static std::array<float, 75264> v30;
+  alignas(16) static std::array<float, 12544> v31;
+  alignas(16) static std::array<float, 12544> v32;
+  alignas(16) static std::array<float, 75264> v33;
+  alignas(16) static std::array<float, 75264> v34;
+  alignas(16) static std::array<float, 12544> v35;
+  alignas(16) static std::array<float, 12544> v36;
+  alignas(16) static std::array<float, 75264> v37;
+  alignas(16) static std::array<float, 75264> v38;
+  alignas(16) static std::array<float, 18816> v39;
+  alignas(16) static std::array<float, 112896> v40;
+  alignas(16) static std::array<float, 112896> v41;
+  alignas(16) static std::array<float, 18816> v42;
+  alignas(16) static std::array<float, 18816> v43;
+  alignas(16) static std::array<float, 112896> v44;
+  alignas(16) static std::array<float, 112896> v45;
+  alignas(16) static std::array<float, 18816> v46;
+  alignas(16) static std::array<float, 18816> v47;
+  alignas(16) static std::array<float, 112896> v48;
+  alignas(16) static std::array<float, 28224> v49;
+  alignas(16) static std::array<float, 7840> v50;
+  alignas(16) static std::array<float, 47040> v51;
+  alignas(16) static std::array<float, 47040> v52;
+  alignas(16) static std::array<float, 7840> v53;
+  alignas(16) static std::array<float, 7840> v54;
+  alignas(16) static std::array<float, 47040> v55;
+  alignas(16) static std::array<float, 47040> v56;
+  alignas(16) static std::array<float, 7840> v57;
+  alignas(16) static std::array<float, 7840> v58;
+  alignas(16) static std::array<float, 47040> v59;
+  alignas(16) static std::array<float, 47040> v60;
+  alignas(16) static std::array<float, 15680> v61;
+  alignas(16) static std::array<float, 62720> v62;
+  alignas(16) static std::array<float, 1280> v63;
+  alignas(16) static std::array<float, 1001> v64;
+  alignas(16) static std::array<float, 864> w65;
+  alignas(16) static std::array<float, 32> w66;
+  alignas(16) static std::array<float, 288> w67;
+  alignas(16) static std::array<float, 32> w68;
+  alignas(16) static std::array<float, 512> w69;
+  alignas(16) static std::array<float, 16> w70;
+  alignas(16) static std::array<float, 1536> w71;
+  alignas(16) static std::array<float, 96> w72;
+  alignas(16) static std::array<float, 864> w73;
+  alignas(16) static std::array<float, 96> w74;
+  alignas(16) static std::array<float, 2304> w75;
+  alignas(16) static std::array<float, 24> w76;
+  alignas(16) static std::array<float, 3456> w77;
+  alignas(16) static std::array<float, 144> w78;
+  alignas(16) static std::array<float, 1296> w79;
+  alignas(16) static std::array<float, 144> w80;
+  alignas(16) static std::array<float, 3456> w81;
+  alignas(16) static std::array<float, 24> w82;
+  alignas(16) static std::array<float, 3456> w83;
+  alignas(16) static std::array<float, 144> w84;
+  alignas(16) static std::array<float, 1296> w85;
+  alignas(16) static std::array<float, 144> w86;
+  alignas(16) static std::array<float, 4608> w87;
+  alignas(16) static std::array<float, 32> w88;
+  alignas(16) static std::array<float, 6144> w89;
+  alignas(16) static std::array<float, 192> w90;
+  alignas(16) static std::array<float, 1728> w91;
+  alignas(16) static std::array<float, 192> w92;
+  alignas(16) static std::array<float, 6144> w93;
+  alignas(16) static std::array<float, 32> w94;
+  alignas(16) static std::array<float, 6144> w95;
+  alignas(16) static std::array<float, 192> w96;
+  alignas(16) static std::array<float, 1728> w97;
+  alignas(16) static std::array<float, 192> w98;
+  alignas(16) static std::array<float, 6144> w99;
+  alignas(16) static std::array<float, 32> w100;
+  alignas(16) static std::array<float, 6144> w101;
+  alignas(16) static std::array<float, 192> w102;
+  alignas(16) static std::array<float, 1728> w103;
+  alignas(16) static std::array<float, 192> w104;
+  alignas(16) static std::array<float, 12288> w105;
+  alignas(16) static std::array<float, 64> w106;
+  alignas(16) static std::array<float, 24576> w107;
+  alignas(16) static std::array<float, 384> w108;
+  alignas(16) static std::array<float, 3456> w109;
+  alignas(16) static std::array<float, 384> w110;
+  alignas(16) static std::array<float, 24576> w111;
+  alignas(16) static std::array<float, 64> w112;
+  alignas(16) static std::array<float, 24576> w113;
+  alignas(16) static std::array<float, 384> w114;
+  alignas(16) static std::array<float, 3456> w115;
+  alignas(16) static std::array<float, 384> w116;
+  alignas(16) static std::array<float, 24576> w117;
+  alignas(16) static std::array<float, 64> w118;
+  alignas(16) static std::array<float, 24576> w119;
+  alignas(16) static std::array<float, 384> w120;
+  alignas(16) static std::array<float, 3456> w121;
+  alignas(16) static std::array<float, 384> w122;
+  alignas(16) static std::array<float, 24576> w123;
+  alignas(16) static std::array<float, 64> w124;
+  alignas(16) static std::array<float, 24576> w125;
+  alignas(16) static std::array<float, 384> w126;
+  alignas(16) static std::array<float, 3456> w127;
+  alignas(16) static std::array<float, 384> w128;
+  alignas(16) static std::array<float, 36864> w129;
+  alignas(16) static std::array<float, 96> w130;
+  alignas(16) static std::array<float, 55296> w131;
+  alignas(16) static std::array<float, 576> w132;
+  alignas(16) static std::array<float, 5184> w133;
+  alignas(16) static std::array<float, 576> w134;
+  alignas(16) static std::array<float, 55296> w135;
+  alignas(16) static std::array<float, 96> w136;
+  alignas(16) static std::array<float, 55296> w137;
+  alignas(16) static std::array<float, 576> w138;
+  alignas(16) static std::array<float, 5184> w139;
+  alignas(16) static std::array<float, 576> w140;
+  alignas(16) static std::array<float, 55296> w141;
+  alignas(16) static std::array<float, 96> w142;
+  alignas(16) static std::array<float, 55296> w143;
+  alignas(16) static std::array<float, 576> w144;
+  alignas(16) static std::array<float, 5184> w145;
+  alignas(16) static std::array<float, 576> w146;
+  alignas(16) static std::array<float, 92160> w147;
+  alignas(16) static std::array<float, 160> w148;
+  alignas(16) static std::array<float, 153600> w149;
+  alignas(16) static std::array<float, 960> w150;
+  alignas(16) static std::array<float, 8640> w151;
+  alignas(16) static std::array<float, 960> w152;
+  alignas(16) static std::array<float, 153600> w153;
+  alignas(16) static std::array<float, 160> w154;
+  alignas(16) static std::array<float, 153600> w155;
+  alignas(16) static std::array<float, 960> w156;
+  alignas(16) static std::array<float, 8640> w157;
+  alignas(16) static std::array<float, 960> w158;
+  alignas(16) static std::array<float, 153600> w159;
+  alignas(16) static std::array<float, 160> w160;
+  alignas(16) static std::array<float, 153600> w161;
+  alignas(16) static std::array<float, 960> w162;
+  alignas(16) static std::array<float, 8640> w163;
+  alignas(16) static std::array<float, 960> w164;
+  alignas(16) static std::array<float, 307200> w165;
+  alignas(16) static std::array<float, 320> w166;
+  alignas(16) static std::array<float, 409600> w167;
+  alignas(16) static std::array<float, 1280> w168;
+  alignas(16) static std::array<float, 1281280> w169;
+  alignas(16) static std::array<float, 1001> w170;
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.0f, +1.0f), std::ref(rng));
+  std::generate(v0.begin(), v0.end(), std::ref(f32rng));
+  std::generate(v1.begin(), v1.end(), std::ref(f32rng));
+  std::generate(v2.begin(), v2.end(), std::ref(f32rng));
+  std::generate(v3.begin(), v3.end(), std::ref(f32rng));
+  std::generate(v4.begin(), v4.end(), std::ref(f32rng));
+  std::generate(v5.begin(), v5.end(), std::ref(f32rng));
+  std::generate(v6.begin(), v6.end(), std::ref(f32rng));
+  std::generate(v7.begin(), v7.end(), std::ref(f32rng));
+  std::generate(v8.begin(), v8.end(), std::ref(f32rng));
+  std::generate(v9.begin(), v9.end(), std::ref(f32rng));
+  std::generate(v10.begin(), v10.end(), std::ref(f32rng));
+  std::generate(v11.begin(), v11.end(), std::ref(f32rng));
+  std::generate(v12.begin(), v12.end(), std::ref(f32rng));
+  std::generate(v13.begin(), v13.end(), std::ref(f32rng));
+  std::generate(v14.begin(), v14.end(), std::ref(f32rng));
+  std::generate(v15.begin(), v15.end(), std::ref(f32rng));
+  std::generate(v16.begin(), v16.end(), std::ref(f32rng));
+  std::generate(v17.begin(), v17.end(), std::ref(f32rng));
+  std::generate(v18.begin(), v18.end(), std::ref(f32rng));
+  std::generate(v19.begin(), v19.end(), std::ref(f32rng));
+  std::generate(v20.begin(), v20.end(), std::ref(f32rng));
+  std::generate(v21.begin(), v21.end(), std::ref(f32rng));
+  std::generate(v22.begin(), v22.end(), std::ref(f32rng));
+  std::generate(v23.begin(), v23.end(), std::ref(f32rng));
+  std::generate(v24.begin(), v24.end(), std::ref(f32rng));
+  std::generate(v25.begin(), v25.end(), std::ref(f32rng));
+  std::generate(v26.begin(), v26.end(), std::ref(f32rng));
+  std::generate(v27.begin(), v27.end(), std::ref(f32rng));
+  std::generate(v28.begin(), v28.end(), std::ref(f32rng));
+  std::generate(v29.begin(), v29.end(), std::ref(f32rng));
+  std::generate(v30.begin(), v30.end(), std::ref(f32rng));
+  std::generate(v31.begin(), v31.end(), std::ref(f32rng));
+  std::generate(v32.begin(), v32.end(), std::ref(f32rng));
+  std::generate(v33.begin(), v33.end(), std::ref(f32rng));
+  std::generate(v34.begin(), v34.end(), std::ref(f32rng));
+  std::generate(v35.begin(), v35.end(), std::ref(f32rng));
+  std::generate(v36.begin(), v36.end(), std::ref(f32rng));
+  std::generate(v37.begin(), v37.end(), std::ref(f32rng));
+  std::generate(v38.begin(), v38.end(), std::ref(f32rng));
+  std::generate(v39.begin(), v39.end(), std::ref(f32rng));
+  std::generate(v40.begin(), v40.end(), std::ref(f32rng));
+  std::generate(v41.begin(), v41.end(), std::ref(f32rng));
+  std::generate(v42.begin(), v42.end(), std::ref(f32rng));
+  std::generate(v43.begin(), v43.end(), std::ref(f32rng));
+  std::generate(v44.begin(), v44.end(), std::ref(f32rng));
+  std::generate(v45.begin(), v45.end(), std::ref(f32rng));
+  std::generate(v46.begin(), v46.end(), std::ref(f32rng));
+  std::generate(v47.begin(), v47.end(), std::ref(f32rng));
+  std::generate(v48.begin(), v48.end(), std::ref(f32rng));
+  std::generate(v49.begin(), v49.end(), std::ref(f32rng));
+  std::generate(v50.begin(), v50.end(), std::ref(f32rng));
+  std::generate(v51.begin(), v51.end(), std::ref(f32rng));
+  std::generate(v52.begin(), v52.end(), std::ref(f32rng));
+  std::generate(v53.begin(), v53.end(), std::ref(f32rng));
+  std::generate(v54.begin(), v54.end(), std::ref(f32rng));
+  std::generate(v55.begin(), v55.end(), std::ref(f32rng));
+  std::generate(v56.begin(), v56.end(), std::ref(f32rng));
+  std::generate(v57.begin(), v57.end(), std::ref(f32rng));
+  std::generate(v58.begin(), v58.end(), std::ref(f32rng));
+  std::generate(v59.begin(), v59.end(), std::ref(f32rng));
+  std::generate(v60.begin(), v60.end(), std::ref(f32rng));
+  std::generate(v61.begin(), v61.end(), std::ref(f32rng));
+  std::generate(v62.begin(), v62.end(), std::ref(f32rng));
+  std::generate(v63.begin(), v63.end(), std::ref(f32rng));
+  std::generate(v64.begin(), v64.end(), std::ref(f32rng));
+  std::generate(w65.begin(), w65.end(), std::ref(f32rng));
+  std::generate(w66.begin(), w66.end(), std::ref(f32rng));
+  std::generate(w67.begin(), w67.end(), std::ref(f32rng));
+  std::generate(w68.begin(), w68.end(), std::ref(f32rng));
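+  // Sparse weight emulation, repeated for every pointwise (1x1) convolution weight
+  // below: zero-fill the tensor, overwrite the leading (1 - sparsity) fraction with
+  // random non-zero values, then shuffle so the zeros land at random positions.
+  // Depthwise 3x3 weights and biases remain dense.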
+  std::fill(w69.begin(), w69.end(), 0.0f);
+  std::generate(w69.begin(), w69.end() - size_t(sparsity * w69.size()), std::ref(f32rng));
+  std::shuffle(w69.begin(), w69.end(), rng);
+  std::generate(w70.begin(), w70.end(), std::ref(f32rng));
+  std::fill(w71.begin(), w71.end(), 0.0f);
+  std::generate(w71.begin(), w71.end() - size_t(sparsity * w71.size()), std::ref(f32rng));
+  std::shuffle(w71.begin(), w71.end(), rng);
+  std::generate(w72.begin(), w72.end(), std::ref(f32rng));
+  std::generate(w73.begin(), w73.end(), std::ref(f32rng));
+  std::generate(w74.begin(), w74.end(), std::ref(f32rng));
+  std::fill(w75.begin(), w75.end(), 0.0f);
+  std::generate(w75.begin(), w75.end() - size_t(sparsity * w75.size()), std::ref(f32rng));
+  std::shuffle(w75.begin(), w75.end(), rng);
+  std::generate(w76.begin(), w76.end(), std::ref(f32rng));
+  std::fill(w77.begin(), w77.end(), 0.0f);
+  std::generate(w77.begin(), w77.end() - size_t(sparsity * w77.size()), std::ref(f32rng));
+  std::shuffle(w77.begin(), w77.end(), rng);
+  std::generate(w78.begin(), w78.end(), std::ref(f32rng));
+  std::generate(w79.begin(), w79.end(), std::ref(f32rng));
+  std::generate(w80.begin(), w80.end(), std::ref(f32rng));
+  std::fill(w81.begin(), w81.end(), 0.0f);
+  std::generate(w81.begin(), w81.end() - size_t(sparsity * w81.size()), std::ref(f32rng));
+  std::shuffle(w81.begin(), w81.end(), rng);
+  std::generate(w82.begin(), w82.end(), std::ref(f32rng));
+  std::fill(w83.begin(), w83.end(), 0.0f);
+  std::generate(w83.begin(), w83.end() - size_t(sparsity * w83.size()), std::ref(f32rng));
+  std::shuffle(w83.begin(), w83.end(), rng);
+  std::generate(w84.begin(), w84.end(), std::ref(f32rng));
+  std::generate(w85.begin(), w85.end(), std::ref(f32rng));
+  std::generate(w86.begin(), w86.end(), std::ref(f32rng));
+  std::fill(w87.begin(), w87.end(), 0.0f);
+  std::generate(w87.begin(), w87.end() - size_t(sparsity * w87.size()), std::ref(f32rng));
+  std::shuffle(w87.begin(), w87.end(), rng);
+  std::generate(w88.begin(), w88.end(), std::ref(f32rng));
+  std::fill(w89.begin(), w89.end(), 0.0f);
+  std::generate(w89.begin(), w89.end() - size_t(sparsity * w89.size()), std::ref(f32rng));
+  std::shuffle(w89.begin(), w89.end(), rng);
+  std::generate(w90.begin(), w90.end(), std::ref(f32rng));
+  std::generate(w91.begin(), w91.end(), std::ref(f32rng));
+  std::generate(w92.begin(), w92.end(), std::ref(f32rng));
+  std::fill(w93.begin(), w93.end(), 0.0f);
+  std::generate(w93.begin(), w93.end() - size_t(sparsity * w93.size()), std::ref(f32rng));
+  std::shuffle(w93.begin(), w93.end(), rng);
+  std::generate(w94.begin(), w94.end(), std::ref(f32rng));
+  std::fill(w95.begin(), w95.end(), 0.0f);
+  std::generate(w95.begin(), w95.end() - size_t(sparsity * w95.size()), std::ref(f32rng));
+  std::shuffle(w95.begin(), w95.end(), rng);
+  std::generate(w96.begin(), w96.end(), std::ref(f32rng));
+  std::generate(w97.begin(), w97.end(), std::ref(f32rng));
+  std::generate(w98.begin(), w98.end(), std::ref(f32rng));
+  std::fill(w99.begin(), w99.end(), 0.0f);
+  std::generate(w99.begin(), w99.end() - size_t(sparsity * w99.size()), std::ref(f32rng));
+  std::shuffle(w99.begin(), w99.end(), rng);
+  std::generate(w100.begin(), w100.end(), std::ref(f32rng));
+  std::fill(w101.begin(), w101.end(), 0.0f);
+  std::generate(w101.begin(), w101.end() - size_t(sparsity * w101.size()), std::ref(f32rng));
+  std::shuffle(w101.begin(), w101.end(), rng);
+  std::generate(w102.begin(), w102.end(), std::ref(f32rng));
+  std::generate(w103.begin(), w103.end(), std::ref(f32rng));
+  std::generate(w104.begin(), w104.end(), std::ref(f32rng));
+  std::fill(w105.begin(), w105.end(), 0.0f);
+  std::generate(w105.begin(), w105.end() - size_t(sparsity * w105.size()), std::ref(f32rng));
+  std::shuffle(w105.begin(), w105.end(), rng);
+  std::generate(w106.begin(), w106.end(), std::ref(f32rng));
+  std::fill(w107.begin(), w107.end(), 0.0f);
+  std::generate(w107.begin(), w107.end() - size_t(sparsity * w107.size()), std::ref(f32rng));
+  std::shuffle(w107.begin(), w107.end(), rng);
+  std::generate(w108.begin(), w108.end(), std::ref(f32rng));
+  std::generate(w109.begin(), w109.end(), std::ref(f32rng));
+  std::generate(w110.begin(), w110.end(), std::ref(f32rng));
+  std::fill(w111.begin(), w111.end(), 0.0f);
+  std::generate(w111.begin(), w111.end() - size_t(sparsity * w111.size()), std::ref(f32rng));
+  std::shuffle(w111.begin(), w111.end(), rng);
+  std::generate(w112.begin(), w112.end(), std::ref(f32rng));
+  std::fill(w113.begin(), w113.end(), 0.0f);
+  std::generate(w113.begin(), w113.end() - size_t(sparsity * w113.size()), std::ref(f32rng));
+  std::shuffle(w113.begin(), w113.end(), rng);
+  std::generate(w114.begin(), w114.end(), std::ref(f32rng));
+  std::generate(w115.begin(), w115.end(), std::ref(f32rng));
+  std::generate(w116.begin(), w116.end(), std::ref(f32rng));
+  std::fill(w117.begin(), w117.end(), 0.0f);
+  std::generate(w117.begin(), w117.end() - size_t(sparsity * w117.size()), std::ref(f32rng));
+  std::shuffle(w117.begin(), w117.end(), rng);
+  std::generate(w118.begin(), w118.end(), std::ref(f32rng));
+  std::fill(w119.begin(), w119.end(), 0.0f);
+  std::generate(w119.begin(), w119.end() - size_t(sparsity * w119.size()), std::ref(f32rng));
+  std::shuffle(w119.begin(), w119.end(), rng);
+  std::generate(w120.begin(), w120.end(), std::ref(f32rng));
+  std::generate(w121.begin(), w121.end(), std::ref(f32rng));
+  std::generate(w122.begin(), w122.end(), std::ref(f32rng));
+  std::fill(w123.begin(), w123.end(), 0.0f);
+  std::generate(w123.begin(), w123.end() - size_t(sparsity * w123.size()), std::ref(f32rng));
+  std::shuffle(w123.begin(), w123.end(), rng);
+  std::generate(w124.begin(), w124.end(), std::ref(f32rng));
+  std::fill(w125.begin(), w125.end(), 0.0f);
+  std::generate(w125.begin(), w125.end() - size_t(sparsity * w125.size()), std::ref(f32rng));
+  std::shuffle(w125.begin(), w125.end(), rng);
+  std::generate(w126.begin(), w126.end(), std::ref(f32rng));
+  std::generate(w127.begin(), w127.end(), std::ref(f32rng));
+  std::generate(w128.begin(), w128.end(), std::ref(f32rng));
+  std::fill(w129.begin(), w129.end(), 0.0f);
+  std::generate(w129.begin(), w129.end() - size_t(sparsity * w129.size()), std::ref(f32rng));
+  std::shuffle(w129.begin(), w129.end(), rng);
+  std::generate(w130.begin(), w130.end(), std::ref(f32rng));
+  std::fill(w131.begin(), w131.end(), 0.0f);
+  std::generate(w131.begin(), w131.end() - size_t(sparsity * w131.size()), std::ref(f32rng));
+  std::shuffle(w131.begin(), w131.end(), rng);
+  std::generate(w132.begin(), w132.end(), std::ref(f32rng));
+  std::generate(w133.begin(), w133.end(), std::ref(f32rng));
+  std::generate(w134.begin(), w134.end(), std::ref(f32rng));
+  std::fill(w135.begin(), w135.end(), 0.0f);
+  std::generate(w135.begin(), w135.end() - size_t(sparsity * w135.size()), std::ref(f32rng));
+  std::shuffle(w135.begin(), w135.end(), rng);
+  std::generate(w136.begin(), w136.end(), std::ref(f32rng));
+  std::fill(w137.begin(), w137.end(), 0.0f);
+  std::generate(w137.begin(), w137.end() - size_t(sparsity * w137.size()), std::ref(f32rng));
+  std::shuffle(w137.begin(), w137.end(), rng);
+  std::generate(w138.begin(), w138.end(), std::ref(f32rng));
+  std::generate(w139.begin(), w139.end(), std::ref(f32rng));
+  std::generate(w140.begin(), w140.end(), std::ref(f32rng));
+  std::fill(w141.begin(), w141.end(), 0.0f);
+  std::generate(w141.begin(), w141.end() - size_t(sparsity * w141.size()), std::ref(f32rng));
+  std::shuffle(w141.begin(), w141.end(), rng);
+  std::generate(w142.begin(), w142.end(), std::ref(f32rng));
+  std::fill(w143.begin(), w143.end(), 0.0f);
+  std::generate(w143.begin(), w143.end() - size_t(sparsity * w143.size()), std::ref(f32rng));
+  std::shuffle(w143.begin(), w143.end(), rng);
+  std::generate(w144.begin(), w144.end(), std::ref(f32rng));
+  std::generate(w145.begin(), w145.end(), std::ref(f32rng));
+  std::generate(w146.begin(), w146.end(), std::ref(f32rng));
+  std::fill(w147.begin(), w147.end(), 0.0f);
+  std::generate(w147.begin(), w147.end() - size_t(sparsity * w147.size()), std::ref(f32rng));
+  std::shuffle(w147.begin(), w147.end(), rng);
+  std::generate(w148.begin(), w148.end(), std::ref(f32rng));
+  std::fill(w149.begin(), w149.end(), 0.0f);
+  std::generate(w149.begin(), w149.end() - size_t(sparsity * w149.size()), std::ref(f32rng));
+  std::shuffle(w149.begin(), w149.end(), rng);
+  std::generate(w150.begin(), w150.end(), std::ref(f32rng));
+  std::generate(w151.begin(), w151.end(), std::ref(f32rng));
+  std::generate(w152.begin(), w152.end(), std::ref(f32rng));
+  std::fill(w153.begin(), w153.end(), 0.0f);
+  std::generate(w153.begin(), w153.end() - size_t(sparsity * w153.size()), std::ref(f32rng));
+  std::shuffle(w153.begin(), w153.end(), rng);
+  std::generate(w154.begin(), w154.end(), std::ref(f32rng));
+  std::fill(w155.begin(), w155.end(), 0.0f);
+  std::generate(w155.begin(), w155.end() - size_t(sparsity * w155.size()), std::ref(f32rng));
+  std::shuffle(w155.begin(), w155.end(), rng);
+  std::generate(w156.begin(), w156.end(), std::ref(f32rng));
+  std::generate(w157.begin(), w157.end(), std::ref(f32rng));
+  std::generate(w158.begin(), w158.end(), std::ref(f32rng));
+  std::fill(w159.begin(), w159.end(), 0.0f);
+  std::generate(w159.begin(), w159.end() - size_t(sparsity * w159.size()), std::ref(f32rng));
+  std::shuffle(w159.begin(), w159.end(), rng);
+  std::generate(w160.begin(), w160.end(), std::ref(f32rng));
+  std::fill(w161.begin(), w161.end(), 0.0f);
+  std::generate(w161.begin(), w161.end() - size_t(sparsity * w161.size()), std::ref(f32rng));
+  std::shuffle(w161.begin(), w161.end(), rng);
+  std::generate(w162.begin(), w162.end(), std::ref(f32rng));
+  std::generate(w163.begin(), w163.end(), std::ref(f32rng));
+  std::generate(w164.begin(), w164.end(), std::ref(f32rng));
+  std::fill(w165.begin(), w165.end(), 0.0f);
+  std::generate(w165.begin(), w165.end() - size_t(sparsity * w165.size()), std::ref(f32rng));
+  std::shuffle(w165.begin(), w165.end(), rng);
+  std::generate(w166.begin(), w166.end(), std::ref(f32rng));
+  std::fill(w167.begin(), w167.end(), 0.0f);
+  std::generate(w167.begin(), w167.end() - size_t(sparsity * w167.size()), std::ref(f32rng));
+  std::shuffle(w167.begin(), w167.end(), rng);
+  std::generate(w168.begin(), w168.end(), std::ref(f32rng));
+  std::fill(w169.begin(), w169.end(), 0.0f);
+  std::generate(w169.begin(), w169.end() - size_t(sparsity * w169.size()), std::ref(f32rng));
+  std::shuffle(w169.begin(), w169.end(), rng);
+  std::generate(w170.begin(), w170.end(), std::ref(f32rng));
+
+  ExecutionPlan operators;
+  xnn_status status;
+
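+  // op0 is the only operator created with XNN_FLAG_INPUT_NHWC, which (in XNNPACK)
+  // marks its input tensor as NHWC; the dense 3x3 stride-2 stem thus consumes the
+  // input image directly while the rest of the sparse graph runs on CHW activations.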
+  xnn_operator_t op0 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    1 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 1 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    2 /* subsampling height */, 2 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    3 /* input channels per group */,
+    32 /* output_channels_per_group */,
+    3 /* input pixel stride */,
+    32 /* output pixel stride */,
+    w65.data(), w66.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    XNN_FLAG_INPUT_NHWC /* flags */,
+    &op0);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #0" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op0, xnn_delete_operator);
+
+  xnn_operator_t op1 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    1 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 1 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    32 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    32 /* input pixel stride */,
+    32 /* output pixel stride */,
+    w67.data(), w68.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op1);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #1" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op1, xnn_delete_operator);
+
+  xnn_operator_t op2 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    32 /* input channels per group */,
+    16 /* output_channels_per_group */,
+    32 /* input pixel stride */,
+    16 /* output pixel stride */,
+    w69.data(), w70.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op2);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #2" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op2, xnn_delete_operator);
+
+  xnn_operator_t op3 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    16 /* input channels per group */,
+    96 /* output_channels_per_group */,
+    16 /* input pixel stride */,
+    96 /* output pixel stride */,
+    w71.data(), w72.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op3);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #3" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op3, xnn_delete_operator);
+
+  xnn_operator_t op4 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    1 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 1 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    2 /* subsampling height */, 2 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    96 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    96 /* input pixel stride */,
+    96 /* output pixel stride */,
+    w73.data(), w74.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op4);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #4" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op4, xnn_delete_operator);
+
+  xnn_operator_t op5 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    96 /* input channels per group */,
+    24 /* output_channels_per_group */,
+    96 /* input pixel stride */,
+    24 /* output pixel stride */,
+    w75.data(), w76.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op5);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #5" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op5, xnn_delete_operator);
+
+  xnn_operator_t op6 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    24 /* input channels per group */,
+    144 /* output_channels_per_group */,
+    24 /* input pixel stride */,
+    144 /* output pixel stride */,
+    w77.data(), w78.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op6);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #6" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op6, xnn_delete_operator);
+
+  xnn_operator_t op7 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    1 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 1 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    144 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    144 /* input pixel stride */,
+    144 /* output pixel stride */,
+    w79.data(), w80.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op7);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #7" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op7, xnn_delete_operator);
+
+  xnn_operator_t op8 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    144 /* input channels per group */,
+    24 /* output_channels_per_group */,
+    144 /* input pixel stride */,
+    24 /* output pixel stride */,
+    w81.data(), w82.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op8);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #8" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op8, xnn_delete_operator);
+
+  xnn_operator_t op9 = nullptr;
+  status = xnn_create_add_nd_f32(
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op9);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #9" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op9, xnn_delete_operator);
+
+  xnn_operator_t op10 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    24 /* input channels per group */,
+    144 /* output_channels_per_group */,
+    24 /* input pixel stride */,
+    144 /* output pixel stride */,
+    w83.data(), w84.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op10);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #10" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op10, xnn_delete_operator);
+
+  xnn_operator_t op11 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    1 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 1 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    2 /* subsampling height */, 2 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    144 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    144 /* input pixel stride */,
+    144 /* output pixel stride */,
+    w85.data(), w86.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op11);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #11" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op11, xnn_delete_operator);
+
+  xnn_operator_t op12 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    144 /* input channels per group */,
+    32 /* output_channels_per_group */,
+    144 /* input pixel stride */,
+    32 /* output pixel stride */,
+    w87.data(), w88.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op12);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #12" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op12, xnn_delete_operator);
+
+  xnn_operator_t op13 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    32 /* input channels per group */,
+    192 /* output_channels_per_group */,
+    32 /* input pixel stride */,
+    192 /* output pixel stride */,
+    w89.data(), w90.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op13);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #13" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op13, xnn_delete_operator);
+
+  xnn_operator_t op14 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    1 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 1 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    192 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    192 /* input pixel stride */,
+    192 /* output pixel stride */,
+    w91.data(), w92.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op14);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #14" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op14, xnn_delete_operator);
+
+  xnn_operator_t op15 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    192 /* input channels per group */,
+    32 /* output_channels_per_group */,
+    192 /* input pixel stride */,
+    32 /* output pixel stride */,
+    w93.data(), w94.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op15);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #15" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op15, xnn_delete_operator);
+
+  xnn_operator_t op16 = nullptr;
+  status = xnn_create_add_nd_f32(
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op16);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #16" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op16, xnn_delete_operator);
+
+  xnn_operator_t op17 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    32 /* input channels per group */,
+    192 /* output_channels_per_group */,
+    32 /* input pixel stride */,
+    192 /* output pixel stride */,
+    w95.data(), w96.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op17);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #17" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op17, xnn_delete_operator);
+
+  xnn_operator_t op18 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    1 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 1 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    192 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    192 /* input pixel stride */,
+    192 /* output pixel stride */,
+    w97.data(), w98.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op18);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #18" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op18, xnn_delete_operator);
+
+  xnn_operator_t op19 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    192 /* input channels per group */,
+    32 /* output_channels_per_group */,
+    192 /* input pixel stride */,
+    32 /* output pixel stride */,
+    w99.data(), w100.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op19);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #19" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op19, xnn_delete_operator);
+
+  xnn_operator_t op20 = nullptr;
+  status = xnn_create_add_nd_f32(
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op20);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #20" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op20, xnn_delete_operator);
+
+  xnn_operator_t op21 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    32 /* input channels per group */,
+    192 /* output_channels_per_group */,
+    32 /* input pixel stride */,
+    192 /* output pixel stride */,
+    w101.data(), w102.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op21);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #21" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op21, xnn_delete_operator);
+
+  xnn_operator_t op22 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    1 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 1 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    2 /* subsampling height */, 2 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    192 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    192 /* input pixel stride */,
+    192 /* output pixel stride */,
+    w103.data(), w104.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op22);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #22" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op22, xnn_delete_operator);
+
+  xnn_operator_t op23 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    192 /* input channels per group */,
+    64 /* output_channels_per_group */,
+    192 /* input pixel stride */,
+    64 /* output pixel stride */,
+    w105.data(), w106.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op23);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #23" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op23, xnn_delete_operator);
+
+  xnn_operator_t op24 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    64 /* input channels per group */,
+    384 /* output_channels_per_group */,
+    64 /* input pixel stride */,
+    384 /* output pixel stride */,
+    w107.data(), w108.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op24);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #24" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op24, xnn_delete_operator);
+
+  xnn_operator_t op25 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    1 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 1 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    384 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    384 /* input pixel stride */,
+    384 /* output pixel stride */,
+    w109.data(), w110.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op25);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #25" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op25, xnn_delete_operator);
+
+  xnn_operator_t op26 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    384 /* input channels per group */,
+    64 /* output_channels_per_group */,
+    384 /* input pixel stride */,
+    64 /* output pixel stride */,
+    w111.data(), w112.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op26);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #26" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op26, xnn_delete_operator);
+
+  xnn_operator_t op27 = nullptr;
+  status = xnn_create_add_nd_f32(
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op27);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #27" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op27, xnn_delete_operator);
+
+  xnn_operator_t op28 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    64 /* input channels per group */,
+    384 /* output_channels_per_group */,
+    64 /* input pixel stride */,
+    384 /* output pixel stride */,
+    w113.data(), w114.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op28);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #28" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op28, xnn_delete_operator);
+
+  xnn_operator_t op29 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    1 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 1 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    384 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    384 /* input pixel stride */,
+    384 /* output pixel stride */,
+    w115.data(), w116.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op29);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #29" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op29, xnn_delete_operator);
+
+  xnn_operator_t op30 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    384 /* input channels per group */,
+    64 /* output_channels_per_group */,
+    384 /* input pixel stride */,
+    64 /* output pixel stride */,
+    w117.data(), w118.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op30);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #30" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op30, xnn_delete_operator);
+
+  xnn_operator_t op31 = nullptr;
+  status = xnn_create_add_nd_f32(
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op31);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #31" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op31, xnn_delete_operator);
+
+  xnn_operator_t op32 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    64 /* input channels per group */,
+    384 /* output_channels_per_group */,
+    64 /* input pixel stride */,
+    384 /* output pixel stride */,
+    w119.data(), w120.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op32);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #32" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op32, xnn_delete_operator);
+
+  xnn_operator_t op33 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    1 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 1 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    384 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    384 /* input pixel stride */,
+    384 /* output pixel stride */,
+    w121.data(), w122.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op33);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #33" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op33, xnn_delete_operator);
+
+  xnn_operator_t op34 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    384 /* input channels per group */,
+    64 /* output_channels_per_group */,
+    384 /* input pixel stride */,
+    64 /* output pixel stride */,
+    w123.data(), w124.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op34);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #34" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op34, xnn_delete_operator);
+
+  xnn_operator_t op35 = nullptr;
+  status = xnn_create_add_nd_f32(
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op35);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #35" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op35, xnn_delete_operator);
+
+  xnn_operator_t op36 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    64 /* input channels per group */,
+    384 /* output_channels_per_group */,
+    64 /* input pixel stride */,
+    384 /* output pixel stride */,
+    w125.data(), w126.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op36);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #36" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op36, xnn_delete_operator);
+
+  xnn_operator_t op37 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    1 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 1 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    384 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    384 /* input pixel stride */,
+    384 /* output pixel stride */,
+    w127.data(), w128.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op37);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #37" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op37, xnn_delete_operator);
+
+  xnn_operator_t op38 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    384 /* input channels per group */,
+    96 /* output_channels_per_group */,
+    384 /* input pixel stride */,
+    96 /* output pixel stride */,
+    w129.data(), w130.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op38);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #38" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op38, xnn_delete_operator);
+
+  xnn_operator_t op39 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    96 /* input channels per group */,
+    576 /* output_channels_per_group */,
+    96 /* input pixel stride */,
+    576 /* output pixel stride */,
+    w131.data(), w132.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op39);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #39" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op39, xnn_delete_operator);
+
+  xnn_operator_t op40 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    1 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 1 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    576 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    576 /* input pixel stride */,
+    576 /* output pixel stride */,
+    w133.data(), w134.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op40);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #40" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op40, xnn_delete_operator);
+
+  xnn_operator_t op41 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    576 /* input channels per group */,
+    96 /* output_channels_per_group */,
+    576 /* input pixel stride */,
+    96 /* output pixel stride */,
+    w135.data(), w136.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op41);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #41" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op41, xnn_delete_operator);
+
+  xnn_operator_t op42 = nullptr;
+  status = xnn_create_add_nd_f32(
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op42);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #42" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op42, xnn_delete_operator);
+
+  xnn_operator_t op43 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    96 /* input channels per group */,
+    576 /* output_channels_per_group */,
+    96 /* input pixel stride */,
+    576 /* output pixel stride */,
+    w137.data(), w138.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op43);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #43" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op43, xnn_delete_operator);
+
+  xnn_operator_t op44 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    1 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 1 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    576 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    576 /* input pixel stride */,
+    576 /* output pixel stride */,
+    w139.data(), w140.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op44);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #44" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op44, xnn_delete_operator);
+
+  xnn_operator_t op45 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    576 /* input channels per group */,
+    96 /* output_channels_per_group */,
+    576 /* input pixel stride */,
+    96 /* output pixel stride */,
+    w141.data(), w142.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op45);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #45" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op45, xnn_delete_operator);
+
+  xnn_operator_t op46 = nullptr;
+  status = xnn_create_add_nd_f32(
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op46);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #46" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op46, xnn_delete_operator);
+
+  xnn_operator_t op47 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    96 /* input channels per group */,
+    576 /* output_channels_per_group */,
+    96 /* input pixel stride */,
+    576 /* output pixel stride */,
+    w143.data(), w144.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op47);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #47" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op47, xnn_delete_operator);
+
+  xnn_operator_t op48 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    1 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 1 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    2 /* subsampling height */, 2 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    576 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    576 /* input pixel stride */,
+    576 /* output pixel stride */,
+    w145.data(), w146.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op48);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #48" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op48, xnn_delete_operator);
+
+  xnn_operator_t op49 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    576 /* input channels per group */,
+    160 /* output_channels_per_group */,
+    576 /* input pixel stride */,
+    160 /* output pixel stride */,
+    w147.data(), w148.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op49);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #49" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op49, xnn_delete_operator);
+
+  xnn_operator_t op50 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    160 /* input channels per group */,
+    960 /* output_channels_per_group */,
+    160 /* input pixel stride */,
+    960 /* output pixel stride */,
+    w149.data(), w150.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op50);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #50" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op50, xnn_delete_operator);
+
+  xnn_operator_t op51 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    1 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 1 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    960 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    960 /* input pixel stride */,
+    960 /* output pixel stride */,
+    w151.data(), w152.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op51);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #51" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op51, xnn_delete_operator);
+
+  xnn_operator_t op52 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    960 /* input channels per group */,
+    160 /* output_channels_per_group */,
+    960 /* input pixel stride */,
+    160 /* output pixel stride */,
+    w153.data(), w154.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op52);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #52" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op52, xnn_delete_operator);
+
+  xnn_operator_t op53 = nullptr;
+  status = xnn_create_add_nd_f32(
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op53);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #53" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op53, xnn_delete_operator);
+
+  xnn_operator_t op54 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    160 /* input channels per group */,
+    960 /* output_channels_per_group */,
+    160 /* input pixel stride */,
+    960 /* output pixel stride */,
+    w155.data(), w156.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op54);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #54" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op54, xnn_delete_operator);
+
+  xnn_operator_t op55 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    1 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 1 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    960 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    960 /* input pixel stride */,
+    960 /* output pixel stride */,
+    w157.data(), w158.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op55);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #55" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op55, xnn_delete_operator);
+
+  xnn_operator_t op56 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    960 /* input channels per group */,
+    160 /* output_channels_per_group */,
+    960 /* input pixel stride */,
+    160 /* output pixel stride */,
+    w159.data(), w160.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op56);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #56" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op56, xnn_delete_operator);
+
+  xnn_operator_t op57 = nullptr;
+  status = xnn_create_add_nd_f32(
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op57);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #57" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op57, xnn_delete_operator);
+
+  xnn_operator_t op58 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    160 /* input channels per group */,
+    960 /* output_channels_per_group */,
+    160 /* input pixel stride */,
+    960 /* output pixel stride */,
+    w161.data(), w162.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op58);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #58" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op58, xnn_delete_operator);
+
+  xnn_operator_t op59 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    1 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 1 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    960 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    960 /* input pixel stride */,
+    960 /* output pixel stride */,
+    w163.data(), w164.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op59);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #59" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op59, xnn_delete_operator);
+
+  xnn_operator_t op60 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    960 /* input channels per group */,
+    320 /* output_channels_per_group */,
+    960 /* input pixel stride */,
+    320 /* output pixel stride */,
+    w165.data(), w166.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op60);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #60" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op60, xnn_delete_operator);
+
+  xnn_operator_t op61 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    320 /* input channels per group */,
+    1280 /* output_channels_per_group */,
+    320 /* input pixel stride */,
+    1280 /* output pixel stride */,
+    w167.data(), w168.data(),
+    0.0f /* output min */, 6.0f /* output max */,
+    0 /* flags */,
+    &op61);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #61" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op61, xnn_delete_operator);
+
+  xnn_operator_t op62 = nullptr;
+  status = xnn_create_global_average_pooling_ncw_f32(
+    1280 /* channels */,
+    -std::numeric_limits<float>::infinity(), std::numeric_limits<float>::infinity(),
+    0 /* flags */,
+    &op62);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #62" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op62, xnn_delete_operator);
+
+  xnn_operator_t op63 = nullptr;
+  status = xnn_create_convolution2d_nhwc_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    1280 /* input channels per group */,
+    1001 /* output_channels_per_group */,
+    1280 /* input pixel stride */,
+    1001 /* output pixel stride */,
+    w169.data(), w170.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op63);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #63" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op63, xnn_delete_operator);
+
+
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op0,
+    1 /* batch size */, 224 /* input height */, 224 /* input width */,
+    v0.data() /* input */, v1.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #0" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op1,
+    1 /* batch size */, 112 /* input height */, 112 /* input width */,
+    v1.data() /* input */, v2.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #1" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op2,
+    1 /* batch size */, 112 /* input height */, 112 /* input width */,
+    v2.data() /* input */, v3.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #2" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op3,
+    1 /* batch size */, 112 /* input height */, 112 /* input width */,
+    v3.data() /* input */, v4.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #3" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op4,
+    1 /* batch size */, 112 /* input height */, 112 /* input width */,
+    v4.data() /* input */, v5.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #4" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op5,
+    1 /* batch size */, 56 /* input height */, 56 /* input width */,
+    v5.data() /* input */, v6.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #5" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op6,
+    1 /* batch size */, 56 /* input height */, 56 /* input width */,
+    v6.data() /* input */, v7.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #6" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op7,
+    1 /* batch size */, 56 /* input height */, 56 /* input width */,
+    v7.data() /* input */, v8.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #7" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op8,
+    1 /* batch size */, 56 /* input height */, 56 /* input width */,
+    v8.data() /* input */, v9.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #8" << std::endl;
+    return ExecutionPlan();
+  }
+
+  {
+    const size_t a_shape[] = { 1, 24, 56, 56 };
+    const size_t b_shape[] = { 1, 24, 56, 56 };
+    status = xnn_setup_add_nd_f32(
+      op9,
+      4, a_shape, 4, b_shape,
+      v9.data() /* a */, v6.data() /* b */, v10.data() /* output */,
+      threadpool /* threadpool */);
+  }
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #9" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op10,
+    1 /* batch size */, 56 /* input height */, 56 /* input width */,
+    v10.data() /* input */, v11.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #10" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op11,
+    1 /* batch size */, 56 /* input height */, 56 /* input width */,
+    v11.data() /* input */, v12.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #11" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op12,
+    1 /* batch size */, 28 /* input height */, 28 /* input width */,
+    v12.data() /* input */, v13.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #12" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op13,
+    1 /* batch size */, 28 /* input height */, 28 /* input width */,
+    v13.data() /* input */, v14.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #13" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op14,
+    1 /* batch size */, 28 /* input height */, 28 /* input width */,
+    v14.data() /* input */, v15.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #14" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op15,
+    1 /* batch size */, 28 /* input height */, 28 /* input width */,
+    v15.data() /* input */, v16.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #15" << std::endl;
+    return ExecutionPlan();
+  }
+
+  {
+    const size_t a_shape[] = { 1, 32, 28, 28 };
+    const size_t b_shape[] = { 1, 32, 28, 28 };
+    status = xnn_setup_add_nd_f32(
+      op16,
+      4, a_shape, 4, b_shape,
+      v16.data() /* a */, v13.data() /* b */, v17.data() /* output */,
+      threadpool /* threadpool */);
+  }
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #16" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op17,
+    1 /* batch size */, 28 /* input height */, 28 /* input width */,
+    v17.data() /* input */, v18.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #17" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op18,
+    1 /* batch size */, 28 /* input height */, 28 /* input width */,
+    v18.data() /* input */, v19.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #18" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op19,
+    1 /* batch size */, 28 /* input height */, 28 /* input width */,
+    v19.data() /* input */, v20.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #19" << std::endl;
+    return ExecutionPlan();
+  }
+
+  {
+    const size_t a_shape[] = { 1, 32, 28, 28 };
+    const size_t b_shape[] = { 1, 32, 28, 28 };
+    status = xnn_setup_add_nd_f32(
+      op20,
+      4, a_shape, 4, b_shape,
+      v20.data() /* a */, v17.data() /* b */, v21.data() /* output */,
+      threadpool /* threadpool */);
+  }
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #20" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op21,
+    1 /* batch size */, 28 /* input height */, 28 /* input width */,
+    v21.data() /* input */, v22.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #21" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op22,
+    1 /* batch size */, 28 /* input height */, 28 /* input width */,
+    v22.data() /* input */, v23.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #22" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op23,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v23.data() /* input */, v24.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #23" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op24,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v24.data() /* input */, v25.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #24" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op25,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v25.data() /* input */, v26.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #25" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op26,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v26.data() /* input */, v27.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #26" << std::endl;
+    return ExecutionPlan();
+  }
+
+  {
+    const size_t a_shape[] = { 1, 64, 14, 14 };
+    const size_t b_shape[] = { 1, 64, 14, 14 };
+    status = xnn_setup_add_nd_f32(
+      op27,
+      4, a_shape, 4, b_shape,
+      v27.data() /* a */, v24.data() /* b */, v28.data() /* output */,
+      threadpool /* threadpool */);
+  }
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #27" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op28,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v28.data() /* input */, v29.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #28" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op29,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v29.data() /* input */, v30.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #29" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op30,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v30.data() /* input */, v31.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #30" << std::endl;
+    return ExecutionPlan();
+  }
+
+  {
+    const size_t a_shape[] = { 1, 64, 14, 14 };
+    const size_t b_shape[] = { 1, 64, 14, 14 };
+    status = xnn_setup_add_nd_f32(
+      op31,
+      4, a_shape, 4, b_shape,
+      v31.data() /* a */, v28.data() /* b */, v32.data() /* output */,
+      threadpool /* threadpool */);
+  }
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #31" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op32,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v32.data() /* input */, v33.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #32" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op33,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v33.data() /* input */, v34.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #33" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op34,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v34.data() /* input */, v35.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #34" << std::endl;
+    return ExecutionPlan();
+  }
+
+  {
+    const size_t a_shape[] = { 1, 64, 14, 14 };
+    const size_t b_shape[] = { 1, 64, 14, 14 };
+    status = xnn_setup_add_nd_f32(
+      op35,
+      4, a_shape, 4, b_shape,
+      v35.data() /* a */, v32.data() /* b */, v36.data() /* output */,
+      threadpool /* threadpool */);
+  }
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #35" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op36,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v36.data() /* input */, v37.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #36" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op37,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v37.data() /* input */, v38.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #37" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op38,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v38.data() /* input */, v39.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #38" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op39,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v39.data() /* input */, v40.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #39" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op40,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v40.data() /* input */, v41.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #40" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op41,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v41.data() /* input */, v42.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #41" << std::endl;
+    return ExecutionPlan();
+  }
+
+  {
+    const size_t a_shape[] = { 1, 96, 14, 14 };
+    const size_t b_shape[] = { 1, 96, 14, 14 };
+    status = xnn_setup_add_nd_f32(
+      op42,
+      4, a_shape, 4, b_shape,
+      v42.data() /* a */, v39.data() /* b */, v43.data() /* output */,
+      threadpool /* threadpool */);
+  }
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #42" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op43,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v43.data() /* input */, v44.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #43" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op44,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v44.data() /* input */, v45.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #44" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op45,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v45.data() /* input */, v46.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #45" << std::endl;
+    return ExecutionPlan();
+  }
+
+  {
+    const size_t a_shape[] = { 1, 96, 14, 14 };
+    const size_t b_shape[] = { 1, 96, 14, 14 };
+    status = xnn_setup_add_nd_f32(
+      op46,
+      4, a_shape, 4, b_shape,
+      v46.data() /* a */, v43.data() /* b */, v47.data() /* output */,
+      threadpool /* threadpool */);
+  }
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #46" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op47,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v47.data() /* input */, v48.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #47" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op48,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v48.data() /* input */, v49.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #48" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op49,
+    1 /* batch size */, 7 /* input height */, 7 /* input width */,
+    v49.data() /* input */, v50.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #49" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op50,
+    1 /* batch size */, 7 /* input height */, 7 /* input width */,
+    v50.data() /* input */, v51.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #50" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op51,
+    1 /* batch size */, 7 /* input height */, 7 /* input width */,
+    v51.data() /* input */, v52.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #51" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op52,
+    1 /* batch size */, 7 /* input height */, 7 /* input width */,
+    v52.data() /* input */, v53.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #52" << std::endl;
+    return ExecutionPlan();
+  }
+
+  {
+    const size_t a_shape[] = { 1, 160, 7, 7 };
+    const size_t b_shape[] = { 1, 160, 7, 7 };
+    status = xnn_setup_add_nd_f32(
+      op53,
+      4, a_shape, 4, b_shape,
+      v53.data() /* a */, v50.data() /* b */, v54.data() /* output */,
+      threadpool /* threadpool */);
+  }
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #53" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op54,
+    1 /* batch size */, 7 /* input height */, 7 /* input width */,
+    v54.data() /* input */, v55.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #54" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op55,
+    1 /* batch size */, 7 /* input height */, 7 /* input width */,
+    v55.data() /* input */, v56.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #55" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op56,
+    1 /* batch size */, 7 /* input height */, 7 /* input width */,
+    v56.data() /* input */, v57.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #56" << std::endl;
+    return ExecutionPlan();
+  }
+
+  {
+    const size_t a_shape[] = { 1, 160, 7, 7 };
+    const size_t b_shape[] = { 1, 160, 7, 7 };
+    status = xnn_setup_add_nd_f32(
+      op57,
+      4, a_shape, 4, b_shape,
+      v57.data() /* a */, v54.data() /* b */, v58.data() /* output */,
+      threadpool /* threadpool */);
+  }
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #57" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op58,
+    1 /* batch size */, 7 /* input height */, 7 /* input width */,
+    v58.data() /* input */, v59.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #58" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op59,
+    1 /* batch size */, 7 /* input height */, 7 /* input width */,
+    v59.data() /* input */, v60.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #59" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op60,
+    1 /* batch size */, 7 /* input height */, 7 /* input width */,
+    v60.data() /* input */, v61.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #60" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op61,
+    1 /* batch size */, 7 /* input height */, 7 /* input width */,
+    v61.data() /* input */, v62.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #61" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_global_average_pooling_ncw_f32(
+    op62,
+    1 /* batch size */, 49 /* width */,
+    v62.data() /* input */, v63.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #62" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nhwc_f32(
+    op63,
+    1 /* batch size */, 1 /* input height */, 1 /* input width */,
+    v63.data() /* input */, v64.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #63" << std::endl;
+    return ExecutionPlan();
+  }
+
+  #pragma clang diagnostic push
+  #pragma clang diagnostic ignored "-Wpessimizing-move"
+  return operators;
+  #pragma clang diagnostic pop
+}
+
+}  // namespace models
diff --git a/models/fp32-sparse-mobilenet-v3-large.cc b/models/fp32-sparse-mobilenet-v3-large.cc
new file mode 100644
index 0000000..06eecf3
--- /dev/null
+++ b/models/fp32-sparse-mobilenet-v3-large.cc
@@ -0,0 +1,3813 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <xnnpack.h>
+
+#include <array>
+#include <algorithm>
+#include <functional>
+#include <iostream>
+#include <limits>
+#include <random>
+
+#include "models/models.h"
+
+namespace models {
+
+ExecutionPlan FP32SparseMobileNetV3Large(float sparsity, pthreadpool_t threadpool) {
+  alignas(16) static std::array<float, 150528> v0;
+  alignas(16) static std::array<float, 200704> v1;
+  alignas(16) static std::array<float, 200704> v2;
+  alignas(16) static std::array<float, 200704> v3;
+  alignas(16) static std::array<float, 200704> v4;
+  alignas(16) static std::array<float, 200704> v5;
+  alignas(16) static std::array<float, 802816> v6;
+  alignas(16) static std::array<float, 200704> v7;
+  alignas(16) static std::array<float, 75264> v8;
+  alignas(16) static std::array<float, 225792> v9;
+  alignas(16) static std::array<float, 225792> v10;
+  alignas(16) static std::array<float, 75264> v11;
+  alignas(16) static std::array<float, 75264> v12;
+  alignas(16) static std::array<float, 225792> v13;
+  alignas(16) static std::array<float, 56448> v14;
+  alignas(16) static std::array<float, 72> v15;
+  alignas(16) static std::array<float, 24> v16;
+  alignas(16) static std::array<float, 72> v17;
+  alignas(16) static std::array<float, 56448> v18;
+  alignas(16) static std::array<float, 31360> v19;
+  alignas(16) static std::array<float, 94080> v20;
+  alignas(16) static std::array<float, 94080> v21;
+  alignas(16) static std::array<float, 120> v22;
+  alignas(16) static std::array<float, 32> v23;
+  alignas(16) static std::array<float, 120> v24;
+  alignas(16) static std::array<float, 94080> v25;
+  alignas(16) static std::array<float, 31360> v26;
+  alignas(16) static std::array<float, 31360> v27;
+  alignas(16) static std::array<float, 94080> v28;
+  alignas(16) static std::array<float, 94080> v29;
+  alignas(16) static std::array<float, 120> v30;
+  alignas(16) static std::array<float, 32> v31;
+  alignas(16) static std::array<float, 120> v32;
+  alignas(16) static std::array<float, 94080> v33;
+  alignas(16) static std::array<float, 31360> v34;
+  alignas(16) static std::array<float, 31360> v35;
+  alignas(16) static std::array<float, 188160> v36;
+  alignas(16) static std::array<float, 188160> v37;
+  alignas(16) static std::array<float, 47040> v38;
+  alignas(16) static std::array<float, 47040> v39;
+  alignas(16) static std::array<float, 15680> v40;
+  alignas(16) static std::array<float, 39200> v41;
+  alignas(16) static std::array<float, 39200> v42;
+  alignas(16) static std::array<float, 39200> v43;
+  alignas(16) static std::array<float, 39200> v44;
+  alignas(16) static std::array<float, 15680> v45;
+  alignas(16) static std::array<float, 15680> v46;
+  alignas(16) static std::array<float, 36064> v47;
+  alignas(16) static std::array<float, 36064> v48;
+  alignas(16) static std::array<float, 36064> v49;
+  alignas(16) static std::array<float, 36064> v50;
+  alignas(16) static std::array<float, 15680> v51;
+  alignas(16) static std::array<float, 15680> v52;
+  alignas(16) static std::array<float, 36064> v53;
+  alignas(16) static std::array<float, 36064> v54;
+  alignas(16) static std::array<float, 36064> v55;
+  alignas(16) static std::array<float, 36064> v56;
+  alignas(16) static std::array<float, 15680> v57;
+  alignas(16) static std::array<float, 15680> v58;
+  alignas(16) static std::array<float, 94080> v59;
+  alignas(16) static std::array<float, 94080> v60;
+  alignas(16) static std::array<float, 94080> v61;
+  alignas(16) static std::array<float, 94080> v62;
+  alignas(16) static std::array<float, 480> v63;
+  alignas(16) static std::array<float, 120> v64;
+  alignas(16) static std::array<float, 480> v65;
+  alignas(16) static std::array<float, 94080> v66;
+  alignas(16) static std::array<float, 21952> v67;
+  alignas(16) static std::array<float, 131712> v68;
+  alignas(16) static std::array<float, 131712> v69;
+  alignas(16) static std::array<float, 131712> v70;
+  alignas(16) static std::array<float, 131712> v71;
+  alignas(16) static std::array<float, 672> v72;
+  alignas(16) static std::array<float, 168> v73;
+  alignas(16) static std::array<float, 672> v74;
+  alignas(16) static std::array<float, 131712> v75;
+  alignas(16) static std::array<float, 21952> v76;
+  alignas(16) static std::array<float, 21952> v77;
+  alignas(16) static std::array<float, 131712> v78;
+  alignas(16) static std::array<float, 131712> v79;
+  alignas(16) static std::array<float, 32928> v80;
+  alignas(16) static std::array<float, 32928> v81;
+  alignas(16) static std::array<float, 672> v82;
+  alignas(16) static std::array<float, 168> v83;
+  alignas(16) static std::array<float, 672> v84;
+  alignas(16) static std::array<float, 32928> v85;
+  alignas(16) static std::array<float, 7840> v86;
+  alignas(16) static std::array<float, 47040> v87;
+  alignas(16) static std::array<float, 47040> v88;
+  alignas(16) static std::array<float, 47040> v89;
+  alignas(16) static std::array<float, 47040> v90;
+  alignas(16) static std::array<float, 960> v91;
+  alignas(16) static std::array<float, 240> v92;
+  alignas(16) static std::array<float, 960> v93;
+  alignas(16) static std::array<float, 47040> v94;
+  alignas(16) static std::array<float, 7840> v95;
+  alignas(16) static std::array<float, 7840> v96;
+  alignas(16) static std::array<float, 47040> v97;
+  alignas(16) static std::array<float, 47040> v98;
+  alignas(16) static std::array<float, 47040> v99;
+  alignas(16) static std::array<float, 47040> v100;
+  alignas(16) static std::array<float, 960> v101;
+  alignas(16) static std::array<float, 240> v102;
+  alignas(16) static std::array<float, 960> v103;
+  alignas(16) static std::array<float, 47040> v104;
+  alignas(16) static std::array<float, 7840> v105;
+  alignas(16) static std::array<float, 7840> v106;
+  alignas(16) static std::array<float, 47040> v107;
+  alignas(16) static std::array<float, 47040> v108;
+  alignas(16) static std::array<float, 960> v109;
+  alignas(16) static std::array<float, 1280> v110;
+  alignas(16) static std::array<float, 1280> v111;
+  alignas(16) static std::array<float, 1280> v112;
+  alignas(16) static std::array<float, 1001> v113;
+  alignas(16) static std::array<float, 432> w114;
+  alignas(16) static std::array<float, 16> w115;
+  alignas(16) static std::array<float, 144> w116;
+  alignas(16) static std::array<float, 16> w117;
+  alignas(16) static std::array<float, 256> w118;
+  alignas(16) static std::array<float, 16> w119;
+  alignas(16) static std::array<float, 1024> w120;
+  alignas(16) static std::array<float, 64> w121;
+  alignas(16) static std::array<float, 576> w122;
+  alignas(16) static std::array<float, 64> w123;
+  alignas(16) static std::array<float, 1536> w124;
+  alignas(16) static std::array<float, 24> w125;
+  alignas(16) static std::array<float, 1728> w126;
+  alignas(16) static std::array<float, 72> w127;
+  alignas(16) static std::array<float, 648> w128;
+  alignas(16) static std::array<float, 72> w129;
+  alignas(16) static std::array<float, 1728> w130;
+  alignas(16) static std::array<float, 24> w131;
+  alignas(16) static std::array<float, 1728> w132;
+  alignas(16) static std::array<float, 72> w133;
+  alignas(16) static std::array<float, 1800> w134;
+  alignas(16) static std::array<float, 72> w135;
+  alignas(16) static std::array<float, 1728> w136;
+  alignas(16) static std::array<float, 24> w137;
+  alignas(16) static std::array<float, 1728> w138;
+  alignas(16) static std::array<float, 72> w139;
+  alignas(16) static std::array<float, 2880> w140;
+  alignas(16) static std::array<float, 40> w141;
+  alignas(16) static std::array<float, 4800> w142;
+  alignas(16) static std::array<float, 120> w143;
+  alignas(16) static std::array<float, 3000> w144;
+  alignas(16) static std::array<float, 120> w145;
+  alignas(16) static std::array<float, 3840> w146;
+  alignas(16) static std::array<float, 32> w147;
+  alignas(16) static std::array<float, 3840> w148;
+  alignas(16) static std::array<float, 120> w149;
+  alignas(16) static std::array<float, 4800> w150;
+  alignas(16) static std::array<float, 40> w151;
+  alignas(16) static std::array<float, 4800> w152;
+  alignas(16) static std::array<float, 120> w153;
+  alignas(16) static std::array<float, 3000> w154;
+  alignas(16) static std::array<float, 120> w155;
+  alignas(16) static std::array<float, 3840> w156;
+  alignas(16) static std::array<float, 32> w157;
+  alignas(16) static std::array<float, 3840> w158;
+  alignas(16) static std::array<float, 120> w159;
+  alignas(16) static std::array<float, 4800> w160;
+  alignas(16) static std::array<float, 40> w161;
+  alignas(16) static std::array<float, 9600> w162;
+  alignas(16) static std::array<float, 240> w163;
+  alignas(16) static std::array<float, 2160> w164;
+  alignas(16) static std::array<float, 240> w165;
+  alignas(16) static std::array<float, 19200> w166;
+  alignas(16) static std::array<float, 80> w167;
+  alignas(16) static std::array<float, 16000> w168;
+  alignas(16) static std::array<float, 200> w169;
+  alignas(16) static std::array<float, 1800> w170;
+  alignas(16) static std::array<float, 200> w171;
+  alignas(16) static std::array<float, 16000> w172;
+  alignas(16) static std::array<float, 80> w173;
+  alignas(16) static std::array<float, 14720> w174;
+  alignas(16) static std::array<float, 184> w175;
+  alignas(16) static std::array<float, 1656> w176;
+  alignas(16) static std::array<float, 184> w177;
+  alignas(16) static std::array<float, 14720> w178;
+  alignas(16) static std::array<float, 80> w179;
+  alignas(16) static std::array<float, 14720> w180;
+  alignas(16) static std::array<float, 184> w181;
+  alignas(16) static std::array<float, 1656> w182;
+  alignas(16) static std::array<float, 184> w183;
+  alignas(16) static std::array<float, 14720> w184;
+  alignas(16) static std::array<float, 80> w185;
+  alignas(16) static std::array<float, 38400> w186;
+  alignas(16) static std::array<float, 480> w187;
+  alignas(16) static std::array<float, 4320> w188;
+  alignas(16) static std::array<float, 480> w189;
+  alignas(16) static std::array<float, 57600> w190;
+  alignas(16) static std::array<float, 120> w191;
+  alignas(16) static std::array<float, 57600> w192;
+  alignas(16) static std::array<float, 480> w193;
+  alignas(16) static std::array<float, 53760> w194;
+  alignas(16) static std::array<float, 112> w195;
+  alignas(16) static std::array<float, 75264> w196;
+  alignas(16) static std::array<float, 672> w197;
+  alignas(16) static std::array<float, 6048> w198;
+  alignas(16) static std::array<float, 672> w199;
+  alignas(16) static std::array<float, 112896> w200;
+  alignas(16) static std::array<float, 168> w201;
+  alignas(16) static std::array<float, 112896> w202;
+  alignas(16) static std::array<float, 672> w203;
+  alignas(16) static std::array<float, 75264> w204;
+  alignas(16) static std::array<float, 112> w205;
+  alignas(16) static std::array<float, 75264> w206;
+  alignas(16) static std::array<float, 672> w207;
+  alignas(16) static std::array<float, 16800> w208;
+  alignas(16) static std::array<float, 672> w209;
+  alignas(16) static std::array<float, 112896> w210;
+  alignas(16) static std::array<float, 168> w211;
+  alignas(16) static std::array<float, 112896> w212;
+  alignas(16) static std::array<float, 672> w213;
+  alignas(16) static std::array<float, 107520> w214;
+  alignas(16) static std::array<float, 160> w215;
+  alignas(16) static std::array<float, 153600> w216;
+  alignas(16) static std::array<float, 960> w217;
+  alignas(16) static std::array<float, 24000> w218;
+  alignas(16) static std::array<float, 960> w219;
+  alignas(16) static std::array<float, 230400> w220;
+  alignas(16) static std::array<float, 240> w221;
+  alignas(16) static std::array<float, 230400> w222;
+  alignas(16) static std::array<float, 960> w223;
+  alignas(16) static std::array<float, 153600> w224;
+  alignas(16) static std::array<float, 160> w225;
+  alignas(16) static std::array<float, 153600> w226;
+  alignas(16) static std::array<float, 960> w227;
+  alignas(16) static std::array<float, 24000> w228;
+  alignas(16) static std::array<float, 960> w229;
+  alignas(16) static std::array<float, 230400> w230;
+  alignas(16) static std::array<float, 240> w231;
+  alignas(16) static std::array<float, 230400> w232;
+  alignas(16) static std::array<float, 960> w233;
+  alignas(16) static std::array<float, 153600> w234;
+  alignas(16) static std::array<float, 160> w235;
+  alignas(16) static std::array<float, 153600> w236;
+  alignas(16) static std::array<float, 960> w237;
+  alignas(16) static std::array<float, 1228800> w238;
+  alignas(16) static std::array<float, 1280> w239;
+  alignas(16) static std::array<float, 1281280> w240;
+  alignas(16) static std::array<float, 1001> w241;
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.0f, +1.0f), std::ref(rng));
+  std::generate(v0.begin(), v0.end(), std::ref(f32rng));
+  std::generate(v1.begin(), v1.end(), std::ref(f32rng));
+  std::generate(v2.begin(), v2.end(), std::ref(f32rng));
+  std::generate(v3.begin(), v3.end(), std::ref(f32rng));
+  std::generate(v4.begin(), v4.end(), std::ref(f32rng));
+  std::generate(v5.begin(), v5.end(), std::ref(f32rng));
+  std::generate(v6.begin(), v6.end(), std::ref(f32rng));
+  std::generate(v7.begin(), v7.end(), std::ref(f32rng));
+  std::generate(v8.begin(), v8.end(), std::ref(f32rng));
+  std::generate(v9.begin(), v9.end(), std::ref(f32rng));
+  std::generate(v10.begin(), v10.end(), std::ref(f32rng));
+  std::generate(v11.begin(), v11.end(), std::ref(f32rng));
+  std::generate(v12.begin(), v12.end(), std::ref(f32rng));
+  std::generate(v13.begin(), v13.end(), std::ref(f32rng));
+  std::generate(v14.begin(), v14.end(), std::ref(f32rng));
+  std::generate(v15.begin(), v15.end(), std::ref(f32rng));
+  std::generate(v16.begin(), v16.end(), std::ref(f32rng));
+  std::generate(v17.begin(), v17.end(), std::ref(f32rng));
+  std::generate(v18.begin(), v18.end(), std::ref(f32rng));
+  std::generate(v19.begin(), v19.end(), std::ref(f32rng));
+  std::generate(v20.begin(), v20.end(), std::ref(f32rng));
+  std::generate(v21.begin(), v21.end(), std::ref(f32rng));
+  std::generate(v22.begin(), v22.end(), std::ref(f32rng));
+  std::generate(v23.begin(), v23.end(), std::ref(f32rng));
+  std::generate(v24.begin(), v24.end(), std::ref(f32rng));
+  std::generate(v25.begin(), v25.end(), std::ref(f32rng));
+  std::generate(v26.begin(), v26.end(), std::ref(f32rng));
+  std::generate(v27.begin(), v27.end(), std::ref(f32rng));
+  std::generate(v28.begin(), v28.end(), std::ref(f32rng));
+  std::generate(v29.begin(), v29.end(), std::ref(f32rng));
+  std::generate(v30.begin(), v30.end(), std::ref(f32rng));
+  std::generate(v31.begin(), v31.end(), std::ref(f32rng));
+  std::generate(v32.begin(), v32.end(), std::ref(f32rng));
+  std::generate(v33.begin(), v33.end(), std::ref(f32rng));
+  std::generate(v34.begin(), v34.end(), std::ref(f32rng));
+  std::generate(v35.begin(), v35.end(), std::ref(f32rng));
+  std::generate(v36.begin(), v36.end(), std::ref(f32rng));
+  std::generate(v37.begin(), v37.end(), std::ref(f32rng));
+  std::generate(v38.begin(), v38.end(), std::ref(f32rng));
+  std::generate(v39.begin(), v39.end(), std::ref(f32rng));
+  std::generate(v40.begin(), v40.end(), std::ref(f32rng));
+  std::generate(v41.begin(), v41.end(), std::ref(f32rng));
+  std::generate(v42.begin(), v42.end(), std::ref(f32rng));
+  std::generate(v43.begin(), v43.end(), std::ref(f32rng));
+  std::generate(v44.begin(), v44.end(), std::ref(f32rng));
+  std::generate(v45.begin(), v45.end(), std::ref(f32rng));
+  std::generate(v46.begin(), v46.end(), std::ref(f32rng));
+  std::generate(v47.begin(), v47.end(), std::ref(f32rng));
+  std::generate(v48.begin(), v48.end(), std::ref(f32rng));
+  std::generate(v49.begin(), v49.end(), std::ref(f32rng));
+  std::generate(v50.begin(), v50.end(), std::ref(f32rng));
+  std::generate(v51.begin(), v51.end(), std::ref(f32rng));
+  std::generate(v52.begin(), v52.end(), std::ref(f32rng));
+  std::generate(v53.begin(), v53.end(), std::ref(f32rng));
+  std::generate(v54.begin(), v54.end(), std::ref(f32rng));
+  std::generate(v55.begin(), v55.end(), std::ref(f32rng));
+  std::generate(v56.begin(), v56.end(), std::ref(f32rng));
+  std::generate(v57.begin(), v57.end(), std::ref(f32rng));
+  std::generate(v58.begin(), v58.end(), std::ref(f32rng));
+  std::generate(v59.begin(), v59.end(), std::ref(f32rng));
+  std::generate(v60.begin(), v60.end(), std::ref(f32rng));
+  std::generate(v61.begin(), v61.end(), std::ref(f32rng));
+  std::generate(v62.begin(), v62.end(), std::ref(f32rng));
+  std::generate(v63.begin(), v63.end(), std::ref(f32rng));
+  std::generate(v64.begin(), v64.end(), std::ref(f32rng));
+  std::generate(v65.begin(), v65.end(), std::ref(f32rng));
+  std::generate(v66.begin(), v66.end(), std::ref(f32rng));
+  std::generate(v67.begin(), v67.end(), std::ref(f32rng));
+  std::generate(v68.begin(), v68.end(), std::ref(f32rng));
+  std::generate(v69.begin(), v69.end(), std::ref(f32rng));
+  std::generate(v70.begin(), v70.end(), std::ref(f32rng));
+  std::generate(v71.begin(), v71.end(), std::ref(f32rng));
+  std::generate(v72.begin(), v72.end(), std::ref(f32rng));
+  std::generate(v73.begin(), v73.end(), std::ref(f32rng));
+  std::generate(v74.begin(), v74.end(), std::ref(f32rng));
+  std::generate(v75.begin(), v75.end(), std::ref(f32rng));
+  std::generate(v76.begin(), v76.end(), std::ref(f32rng));
+  std::generate(v77.begin(), v77.end(), std::ref(f32rng));
+  std::generate(v78.begin(), v78.end(), std::ref(f32rng));
+  std::generate(v79.begin(), v79.end(), std::ref(f32rng));
+  std::generate(v80.begin(), v80.end(), std::ref(f32rng));
+  std::generate(v81.begin(), v81.end(), std::ref(f32rng));
+  std::generate(v82.begin(), v82.end(), std::ref(f32rng));
+  std::generate(v83.begin(), v83.end(), std::ref(f32rng));
+  std::generate(v84.begin(), v84.end(), std::ref(f32rng));
+  std::generate(v85.begin(), v85.end(), std::ref(f32rng));
+  std::generate(v86.begin(), v86.end(), std::ref(f32rng));
+  std::generate(v87.begin(), v87.end(), std::ref(f32rng));
+  std::generate(v88.begin(), v88.end(), std::ref(f32rng));
+  std::generate(v89.begin(), v89.end(), std::ref(f32rng));
+  std::generate(v90.begin(), v90.end(), std::ref(f32rng));
+  std::generate(v91.begin(), v91.end(), std::ref(f32rng));
+  std::generate(v92.begin(), v92.end(), std::ref(f32rng));
+  std::generate(v93.begin(), v93.end(), std::ref(f32rng));
+  std::generate(v94.begin(), v94.end(), std::ref(f32rng));
+  std::generate(v95.begin(), v95.end(), std::ref(f32rng));
+  std::generate(v96.begin(), v96.end(), std::ref(f32rng));
+  std::generate(v97.begin(), v97.end(), std::ref(f32rng));
+  std::generate(v98.begin(), v98.end(), std::ref(f32rng));
+  std::generate(v99.begin(), v99.end(), std::ref(f32rng));
+  std::generate(v100.begin(), v100.end(), std::ref(f32rng));
+  std::generate(v101.begin(), v101.end(), std::ref(f32rng));
+  std::generate(v102.begin(), v102.end(), std::ref(f32rng));
+  std::generate(v103.begin(), v103.end(), std::ref(f32rng));
+  std::generate(v104.begin(), v104.end(), std::ref(f32rng));
+  std::generate(v105.begin(), v105.end(), std::ref(f32rng));
+  std::generate(v106.begin(), v106.end(), std::ref(f32rng));
+  std::generate(v107.begin(), v107.end(), std::ref(f32rng));
+  std::generate(v108.begin(), v108.end(), std::ref(f32rng));
+  std::generate(v109.begin(), v109.end(), std::ref(f32rng));
+  std::generate(v110.begin(), v110.end(), std::ref(f32rng));
+  std::generate(v111.begin(), v111.end(), std::ref(f32rng));
+  std::generate(v112.begin(), v112.end(), std::ref(f32rng));
+  std::generate(v113.begin(), v113.end(), std::ref(f32rng));
+  std::generate(w114.begin(), w114.end(), std::ref(f32rng));
+  std::generate(w115.begin(), w115.end(), std::ref(f32rng));
+  std::generate(w116.begin(), w116.end(), std::ref(f32rng));
+  std::generate(w117.begin(), w117.end(), std::ref(f32rng));
+  std::fill(w118.begin(), w118.end(), 0.0f);
+  std::generate(w118.begin(), w118.end() - size_t(sparsity * w118.size()), std::ref(f32rng));
+  std::shuffle(w118.begin(), w118.end(), rng);
+  std::generate(w119.begin(), w119.end(), std::ref(f32rng));
+  std::fill(w120.begin(), w120.end(), 0.0f);
+  std::generate(w120.begin(), w120.end() - size_t(sparsity * w120.size()), std::ref(f32rng));
+  std::shuffle(w120.begin(), w120.end(), rng);
+  std::generate(w121.begin(), w121.end(), std::ref(f32rng));
+  std::generate(w122.begin(), w122.end(), std::ref(f32rng));
+  std::generate(w123.begin(), w123.end(), std::ref(f32rng));
+  std::fill(w124.begin(), w124.end(), 0.0f);
+  std::generate(w124.begin(), w124.end() - size_t(sparsity * w124.size()), std::ref(f32rng));
+  std::shuffle(w124.begin(), w124.end(), rng);
+  std::generate(w125.begin(), w125.end(), std::ref(f32rng));
+  std::fill(w126.begin(), w126.end(), 0.0f);
+  std::generate(w126.begin(), w126.end() - size_t(sparsity * w126.size()), std::ref(f32rng));
+  std::shuffle(w126.begin(), w126.end(), rng);
+  std::generate(w127.begin(), w127.end(), std::ref(f32rng));
+  std::generate(w128.begin(), w128.end(), std::ref(f32rng));
+  std::generate(w129.begin(), w129.end(), std::ref(f32rng));
+  std::fill(w130.begin(), w130.end(), 0.0f);
+  std::generate(w130.begin(), w130.end() - size_t(sparsity * w130.size()), std::ref(f32rng));
+  std::shuffle(w130.begin(), w130.end(), rng);
+  std::generate(w131.begin(), w131.end(), std::ref(f32rng));
+  std::fill(w132.begin(), w132.end(), 0.0f);
+  std::generate(w132.begin(), w132.end() - size_t(sparsity * w132.size()), std::ref(f32rng));
+  std::shuffle(w132.begin(), w132.end(), rng);
+  std::generate(w133.begin(), w133.end(), std::ref(f32rng));
+  std::generate(w134.begin(), w134.end(), std::ref(f32rng));
+  std::generate(w135.begin(), w135.end(), std::ref(f32rng));
+  std::fill(w136.begin(), w136.end(), 0.0f);
+  std::generate(w136.begin(), w136.end() - size_t(sparsity * w136.size()), std::ref(f32rng));
+  std::shuffle(w136.begin(), w136.end(), rng);
+  std::generate(w137.begin(), w137.end(), std::ref(f32rng));
+  std::fill(w138.begin(), w138.end(), 0.0f);
+  std::generate(w138.begin(), w138.end() - size_t(sparsity * w138.size()), std::ref(f32rng));
+  std::shuffle(w138.begin(), w138.end(), rng);
+  std::generate(w139.begin(), w139.end(), std::ref(f32rng));
+  std::fill(w140.begin(), w140.end(), 0.0f);
+  std::generate(w140.begin(), w140.end() - size_t(sparsity * w140.size()), std::ref(f32rng));
+  std::shuffle(w140.begin(), w140.end(), rng);
+  std::generate(w141.begin(), w141.end(), std::ref(f32rng));
+  std::fill(w142.begin(), w142.end(), 0.0f);
+  std::generate(w142.begin(), w142.end() - size_t(sparsity * w142.size()), std::ref(f32rng));
+  std::shuffle(w142.begin(), w142.end(), rng);
+  std::generate(w143.begin(), w143.end(), std::ref(f32rng));
+  std::generate(w144.begin(), w144.end(), std::ref(f32rng));
+  std::generate(w145.begin(), w145.end(), std::ref(f32rng));
+  std::fill(w146.begin(), w146.end(), 0.0f);
+  std::generate(w146.begin(), w146.end() - size_t(sparsity * w146.size()), std::ref(f32rng));
+  std::shuffle(w146.begin(), w146.end(), rng);
+  std::generate(w147.begin(), w147.end(), std::ref(f32rng));
+  std::fill(w148.begin(), w148.end(), 0.0f);
+  std::generate(w148.begin(), w148.end() - size_t(sparsity * w148.size()), std::ref(f32rng));
+  std::shuffle(w148.begin(), w148.end(), rng);
+  std::generate(w149.begin(), w149.end(), std::ref(f32rng));
+  std::fill(w150.begin(), w150.end(), 0.0f);
+  std::generate(w150.begin(), w150.end() - size_t(sparsity * w150.size()), std::ref(f32rng));
+  std::shuffle(w150.begin(), w150.end(), rng);
+  std::generate(w151.begin(), w151.end(), std::ref(f32rng));
+  std::fill(w152.begin(), w152.end(), 0.0f);
+  std::generate(w152.begin(), w152.end() - size_t(sparsity * w152.size()), std::ref(f32rng));
+  std::shuffle(w152.begin(), w152.end(), rng);
+  std::generate(w153.begin(), w153.end(), std::ref(f32rng));
+  std::generate(w154.begin(), w154.end(), std::ref(f32rng));
+  std::generate(w155.begin(), w155.end(), std::ref(f32rng));
+  std::fill(w156.begin(), w156.end(), 0.0f);
+  std::generate(w156.begin(), w156.end() - size_t(sparsity * w156.size()), std::ref(f32rng));
+  std::shuffle(w156.begin(), w156.end(), rng);
+  std::generate(w157.begin(), w157.end(), std::ref(f32rng));
+  std::fill(w158.begin(), w158.end(), 0.0f);
+  std::generate(w158.begin(), w158.end() - size_t(sparsity * w158.size()), std::ref(f32rng));
+  std::shuffle(w158.begin(), w158.end(), rng);
+  std::generate(w159.begin(), w159.end(), std::ref(f32rng));
+  std::fill(w160.begin(), w160.end(), 0.0f);
+  std::generate(w160.begin(), w160.end() - size_t(sparsity * w160.size()), std::ref(f32rng));
+  std::shuffle(w160.begin(), w160.end(), rng);
+  std::generate(w161.begin(), w161.end(), std::ref(f32rng));
+  std::fill(w162.begin(), w162.end(), 0.0f);
+  std::generate(w162.begin(), w162.end() - size_t(sparsity * w162.size()), std::ref(f32rng));
+  std::shuffle(w162.begin(), w162.end(), rng);
+  std::generate(w163.begin(), w163.end(), std::ref(f32rng));
+  std::generate(w164.begin(), w164.end(), std::ref(f32rng));
+  std::generate(w165.begin(), w165.end(), std::ref(f32rng));
+  std::fill(w166.begin(), w166.end(), 0.0f);
+  std::generate(w166.begin(), w166.end() - size_t(sparsity * w166.size()), std::ref(f32rng));
+  std::shuffle(w166.begin(), w166.end(), rng);
+  std::generate(w167.begin(), w167.end(), std::ref(f32rng));
+  std::fill(w168.begin(), w168.end(), 0.0f);
+  std::generate(w168.begin(), w168.end() - size_t(sparsity * w168.size()), std::ref(f32rng));
+  std::shuffle(w168.begin(), w168.end(), rng);
+  std::generate(w169.begin(), w169.end(), std::ref(f32rng));
+  std::generate(w170.begin(), w170.end(), std::ref(f32rng));
+  std::generate(w171.begin(), w171.end(), std::ref(f32rng));
+  std::fill(w172.begin(), w172.end(), 0.0f);
+  std::generate(w172.begin(), w172.end() - size_t(sparsity * w172.size()), std::ref(f32rng));
+  std::shuffle(w172.begin(), w172.end(), rng);
+  std::generate(w173.begin(), w173.end(), std::ref(f32rng));
+  std::fill(w174.begin(), w174.end(), 0.0f);
+  std::generate(w174.begin(), w174.end() - size_t(sparsity * w174.size()), std::ref(f32rng));
+  std::shuffle(w174.begin(), w174.end(), rng);
+  std::generate(w175.begin(), w175.end(), std::ref(f32rng));
+  std::generate(w176.begin(), w176.end(), std::ref(f32rng));
+  std::generate(w177.begin(), w177.end(), std::ref(f32rng));
+  std::fill(w178.begin(), w178.end(), 0.0f);
+  std::generate(w178.begin(), w178.end() - size_t(sparsity * w178.size()), std::ref(f32rng));
+  std::shuffle(w178.begin(), w178.end(), rng);
+  std::generate(w179.begin(), w179.end(), std::ref(f32rng));
+  std::fill(w180.begin(), w180.end(), 0.0f);
+  std::generate(w180.begin(), w180.end() - size_t(sparsity * w180.size()), std::ref(f32rng));
+  std::shuffle(w180.begin(), w180.end(), rng);
+  std::generate(w181.begin(), w181.end(), std::ref(f32rng));
+  std::generate(w182.begin(), w182.end(), std::ref(f32rng));
+  std::generate(w183.begin(), w183.end(), std::ref(f32rng));
+  std::fill(w184.begin(), w184.end(), 0.0f);
+  std::generate(w184.begin(), w184.end() - size_t(sparsity * w184.size()), std::ref(f32rng));
+  std::shuffle(w184.begin(), w184.end(), rng);
+  std::generate(w185.begin(), w185.end(), std::ref(f32rng));
+  std::fill(w186.begin(), w186.end(), 0.0f);
+  std::generate(w186.begin(), w186.end() - size_t(sparsity * w186.size()), std::ref(f32rng));
+  std::shuffle(w186.begin(), w186.end(), rng);
+  std::generate(w187.begin(), w187.end(), std::ref(f32rng));
+  std::generate(w188.begin(), w188.end(), std::ref(f32rng));
+  std::generate(w189.begin(), w189.end(), std::ref(f32rng));
+  std::fill(w190.begin(), w190.end(), 0.0f);
+  std::generate(w190.begin(), w190.end() - size_t(sparsity * w190.size()), std::ref(f32rng));
+  std::shuffle(w190.begin(), w190.end(), rng);
+  std::generate(w191.begin(), w191.end(), std::ref(f32rng));
+  std::fill(w192.begin(), w192.end(), 0.0f);
+  std::generate(w192.begin(), w192.end() - size_t(sparsity * w192.size()), std::ref(f32rng));
+  std::shuffle(w192.begin(), w192.end(), rng);
+  std::generate(w193.begin(), w193.end(), std::ref(f32rng));
+  std::fill(w194.begin(), w194.end(), 0.0f);
+  std::generate(w194.begin(), w194.end() - size_t(sparsity * w194.size()), std::ref(f32rng));
+  std::shuffle(w194.begin(), w194.end(), rng);
+  std::generate(w195.begin(), w195.end(), std::ref(f32rng));
+  std::fill(w196.begin(), w196.end(), 0.0f);
+  std::generate(w196.begin(), w196.end() - size_t(sparsity * w196.size()), std::ref(f32rng));
+  std::shuffle(w196.begin(), w196.end(), rng);
+  std::generate(w197.begin(), w197.end(), std::ref(f32rng));
+  std::generate(w198.begin(), w198.end(), std::ref(f32rng));
+  std::generate(w199.begin(), w199.end(), std::ref(f32rng));
+  std::fill(w200.begin(), w200.end(), 0.0f);
+  std::generate(w200.begin(), w200.end() - size_t(sparsity * w200.size()), std::ref(f32rng));
+  std::shuffle(w200.begin(), w200.end(), rng);
+  std::generate(w201.begin(), w201.end(), std::ref(f32rng));
+  std::fill(w202.begin(), w202.end(), 0.0f);
+  std::generate(w202.begin(), w202.end() - size_t(sparsity * w202.size()), std::ref(f32rng));
+  std::shuffle(w202.begin(), w202.end(), rng);
+  std::generate(w203.begin(), w203.end(), std::ref(f32rng));
+  std::fill(w204.begin(), w204.end(), 0.0f);
+  std::generate(w204.begin(), w204.end() - size_t(sparsity * w204.size()), std::ref(f32rng));
+  std::shuffle(w204.begin(), w204.end(), rng);
+  std::generate(w205.begin(), w205.end(), std::ref(f32rng));
+  std::fill(w206.begin(), w206.end(), 0.0f);
+  std::generate(w206.begin(), w206.end() - size_t(sparsity * w206.size()), std::ref(f32rng));
+  std::shuffle(w206.begin(), w206.end(), rng);
+  std::generate(w207.begin(), w207.end(), std::ref(f32rng));
+  std::generate(w208.begin(), w208.end(), std::ref(f32rng));
+  std::generate(w209.begin(), w209.end(), std::ref(f32rng));
+  std::fill(w210.begin(), w210.end(), 0.0f);
+  std::generate(w210.begin(), w210.end() - size_t(sparsity * w210.size()), std::ref(f32rng));
+  std::shuffle(w210.begin(), w210.end(), rng);
+  std::generate(w211.begin(), w211.end(), std::ref(f32rng));
+  std::fill(w212.begin(), w212.end(), 0.0f);
+  std::generate(w212.begin(), w212.end() - size_t(sparsity * w212.size()), std::ref(f32rng));
+  std::shuffle(w212.begin(), w212.end(), rng);
+  std::generate(w213.begin(), w213.end(), std::ref(f32rng));
+  std::fill(w214.begin(), w214.end(), 0.0f);
+  std::generate(w214.begin(), w214.end() - size_t(sparsity * w214.size()), std::ref(f32rng));
+  std::shuffle(w214.begin(), w214.end(), rng);
+  std::generate(w215.begin(), w215.end(), std::ref(f32rng));
+  std::fill(w216.begin(), w216.end(), 0.0f);
+  std::generate(w216.begin(), w216.end() - size_t(sparsity * w216.size()), std::ref(f32rng));
+  std::shuffle(w216.begin(), w216.end(), rng);
+  std::generate(w217.begin(), w217.end(), std::ref(f32rng));
+  std::generate(w218.begin(), w218.end(), std::ref(f32rng));
+  std::generate(w219.begin(), w219.end(), std::ref(f32rng));
+  std::fill(w220.begin(), w220.end(), 0.0f);
+  std::generate(w220.begin(), w220.end() - size_t(sparsity * w220.size()), std::ref(f32rng));
+  std::shuffle(w220.begin(), w220.end(), rng);
+  std::generate(w221.begin(), w221.end(), std::ref(f32rng));
+  std::fill(w222.begin(), w222.end(), 0.0f);
+  std::generate(w222.begin(), w222.end() - size_t(sparsity * w222.size()), std::ref(f32rng));
+  std::shuffle(w222.begin(), w222.end(), rng);
+  std::generate(w223.begin(), w223.end(), std::ref(f32rng));
+  std::fill(w224.begin(), w224.end(), 0.0f);
+  std::generate(w224.begin(), w224.end() - size_t(sparsity * w224.size()), std::ref(f32rng));
+  std::shuffle(w224.begin(), w224.end(), rng);
+  std::generate(w225.begin(), w225.end(), std::ref(f32rng));
+  std::fill(w226.begin(), w226.end(), 0.0f);
+  std::generate(w226.begin(), w226.end() - size_t(sparsity * w226.size()), std::ref(f32rng));
+  std::shuffle(w226.begin(), w226.end(), rng);
+  std::generate(w227.begin(), w227.end(), std::ref(f32rng));
+  std::generate(w228.begin(), w228.end(), std::ref(f32rng));
+  std::generate(w229.begin(), w229.end(), std::ref(f32rng));
+  std::fill(w230.begin(), w230.end(), 0.0f);
+  std::generate(w230.begin(), w230.end() - size_t(sparsity * w230.size()), std::ref(f32rng));
+  std::shuffle(w230.begin(), w230.end(), rng);
+  std::generate(w231.begin(), w231.end(), std::ref(f32rng));
+  std::fill(w232.begin(), w232.end(), 0.0f);
+  std::generate(w232.begin(), w232.end() - size_t(sparsity * w232.size()), std::ref(f32rng));
+  std::shuffle(w232.begin(), w232.end(), rng);
+  std::generate(w233.begin(), w233.end(), std::ref(f32rng));
+  std::fill(w234.begin(), w234.end(), 0.0f);
+  std::generate(w234.begin(), w234.end() - size_t(sparsity * w234.size()), std::ref(f32rng));
+  std::shuffle(w234.begin(), w234.end(), rng);
+  std::generate(w235.begin(), w235.end(), std::ref(f32rng));
+  std::fill(w236.begin(), w236.end(), 0.0f);
+  std::generate(w236.begin(), w236.end() - size_t(sparsity * w236.size()), std::ref(f32rng));
+  std::shuffle(w236.begin(), w236.end(), rng);
+  std::generate(w237.begin(), w237.end(), std::ref(f32rng));
+  std::generate(w238.begin(), w238.end(), std::ref(f32rng));
+  std::generate(w239.begin(), w239.end(), std::ref(f32rng));
+  std::generate(w240.begin(), w240.end(), std::ref(f32rng));
+  std::generate(w241.begin(), w241.end(), std::ref(f32rng));
+
+  ExecutionPlan operators;
+  xnn_status status;
+
+  xnn_operator_t op0 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    1 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 1 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    2 /* subsampling height */, 2 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    3 /* input channels per group */,
+    16 /* output_channels_per_group */,
+    3 /* input pixel stride */,
+    16 /* output pixel stride */,
+    w114.data(), w115.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    XNN_FLAG_INPUT_NHWC /* flags */,
+    &op0);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #0" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op0, xnn_delete_operator);
+
+  xnn_operator_t op1 = nullptr;
+  status = xnn_create_hardswish_nc_f32(
+    16 /* channels */,
+    16 /* input stride */,
+    16 /* output stride */,
+    0 /* flags */,
+    &op1);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #1" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op1, xnn_delete_operator);
+
+  xnn_operator_t op2 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    1 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 1 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    16 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    16 /* input pixel stride */,
+    16 /* output pixel stride */,
+    w116.data(), w117.data(),
+    0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op2);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #2" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op2, xnn_delete_operator);
+
+  xnn_operator_t op3 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    16 /* input channels per group */,
+    16 /* output_channels_per_group */,
+    16 /* input pixel stride */,
+    16 /* output pixel stride */,
+    w118.data(), w119.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op3);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #3" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op3, xnn_delete_operator);
+
+  xnn_operator_t op4 = nullptr;
+  status = xnn_create_add_nd_f32(
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op4);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #4" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op4, xnn_delete_operator);
+
+  xnn_operator_t op5 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    16 /* input channels per group */,
+    64 /* output_channels_per_group */,
+    16 /* input pixel stride */,
+    64 /* output pixel stride */,
+    w120.data(), w121.data(),
+    0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op5);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #5" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op5, xnn_delete_operator);
+
+  xnn_operator_t op6 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    1 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 1 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    2 /* subsampling height */, 2 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    64 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    64 /* input pixel stride */,
+    64 /* output pixel stride */,
+    w122.data(), w123.data(),
+    0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op6);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #6" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op6, xnn_delete_operator);
+
+  xnn_operator_t op7 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    64 /* input channels per group */,
+    24 /* output_channels_per_group */,
+    64 /* input pixel stride */,
+    24 /* output pixel stride */,
+    w124.data(), w125.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op7);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #7" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op7, xnn_delete_operator);
+
+  xnn_operator_t op8 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    24 /* input channels per group */,
+    72 /* output_channels_per_group */,
+    24 /* input pixel stride */,
+    72 /* output pixel stride */,
+    w126.data(), w127.data(),
+    0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op8);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #8" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op8, xnn_delete_operator);
+
+  xnn_operator_t op9 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    1 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 1 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    72 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    72 /* input pixel stride */,
+    72 /* output pixel stride */,
+    w128.data(), w129.data(),
+    0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op9);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #9" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op9, xnn_delete_operator);
+
+  xnn_operator_t op10 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    72 /* input channels per group */,
+    24 /* output_channels_per_group */,
+    72 /* input pixel stride */,
+    24 /* output pixel stride */,
+    w130.data(), w131.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op10);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #10" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op10, xnn_delete_operator);
+
+  xnn_operator_t op11 = nullptr;
+  status = xnn_create_add_nd_f32(
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op11);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #11" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op11, xnn_delete_operator);
+
+  xnn_operator_t op12 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    24 /* input channels per group */,
+    72 /* output_channels_per_group */,
+    24 /* input pixel stride */,
+    72 /* output pixel stride */,
+    w132.data(), w133.data(),
+    0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op12);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #12" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op12, xnn_delete_operator);
+
+  xnn_operator_t op13 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    2 /* top padding */, 2 /* right padding */,
+    2 /* bottom padding */, 2 /* left padding */,
+    5 /* kernel height */, 5 /* kernel width */,
+    2 /* subsampling height */, 2 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    72 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    72 /* input pixel stride */,
+    72 /* output pixel stride */,
+    w134.data(), w135.data(),
+    0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op13);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #13" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op13, xnn_delete_operator);
+
+  xnn_operator_t op14 = nullptr;
+  status = xnn_create_global_average_pooling_ncw_f32(
+    72 /* channels */,
+    -std::numeric_limits<float>::infinity(), std::numeric_limits<float>::infinity(),
+    0 /* flags */,
+    &op14);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #14" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op14, xnn_delete_operator);
+
+  xnn_operator_t op15 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    72 /* input channels per group */,
+    24 /* output_channels_per_group */,
+    72 /* input pixel stride */,
+    24 /* output pixel stride */,
+    w136.data(), w137.data(),
+    0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op15);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #15" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op15, xnn_delete_operator);
+
+  xnn_operator_t op16 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    24 /* input channels per group */,
+    72 /* output_channels_per_group */,
+    24 /* input pixel stride */,
+    72 /* output pixel stride */,
+    w138.data(), w139.data(),
+    0.0f /* output min */, +0x1.00014Fp+0 /* output max */,
+    0 /* flags */,
+    &op16);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #16" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op16, xnn_delete_operator);
+
+  xnn_operator_t op17 = nullptr;
+  status = xnn_create_multiply_nd_f32(
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op17);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #17" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op17, xnn_delete_operator);
+
+  xnn_operator_t op18 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    72 /* input channels per group */,
+    40 /* output_channels_per_group */,
+    72 /* input pixel stride */,
+    40 /* output pixel stride */,
+    w140.data(), w141.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op18);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #18" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op18, xnn_delete_operator);
+
+  xnn_operator_t op19 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    40 /* input channels per group */,
+    120 /* output_channels_per_group */,
+    40 /* input pixel stride */,
+    120 /* output pixel stride */,
+    w142.data(), w143.data(),
+    0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op19);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #19" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op19, xnn_delete_operator);
+
+  xnn_operator_t op20 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    2 /* top padding */, 2 /* right padding */,
+    2 /* bottom padding */, 2 /* left padding */,
+    5 /* kernel height */, 5 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    120 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    120 /* input pixel stride */,
+    120 /* output pixel stride */,
+    w144.data(), w145.data(),
+    0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op20);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #20" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op20, xnn_delete_operator);
+
+  xnn_operator_t op21 = nullptr;
+  status = xnn_create_global_average_pooling_ncw_f32(
+    120 /* channels */,
+    -std::numeric_limits<float>::infinity(), std::numeric_limits<float>::infinity(),
+    0 /* flags */,
+    &op21);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #21" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op21, xnn_delete_operator);
+
+  xnn_operator_t op22 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    120 /* input channels per group */,
+    32 /* output_channels_per_group */,
+    120 /* input pixel stride */,
+    32 /* output pixel stride */,
+    w146.data(), w147.data(),
+    0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op22);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #22" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op22, xnn_delete_operator);
+
+  xnn_operator_t op23 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    32 /* input channels per group */,
+    120 /* output_channels_per_group */,
+    32 /* input pixel stride */,
+    120 /* output pixel stride */,
+    w148.data(), w149.data(),
+    0.0f /* output min */, +0x1.00014Fp+0 /* output max */,
+    0 /* flags */,
+    &op23);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #23" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op23, xnn_delete_operator);
+
+  xnn_operator_t op24 = nullptr;
+  status = xnn_create_multiply_nd_f32(
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op24);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #24" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op24, xnn_delete_operator);
+
+  xnn_operator_t op25 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    120 /* input channels per group */,
+    40 /* output_channels_per_group */,
+    120 /* input pixel stride */,
+    40 /* output pixel stride */,
+    w150.data(), w151.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op25);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #25" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op25, xnn_delete_operator);
+
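+  // op26 appears to be the residual shortcut addition for this block, so its
+  // output range is left unclamped.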
+  xnn_operator_t op26 = nullptr;
+  status = xnn_create_add_nd_f32(
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op26);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #26" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op26, xnn_delete_operator);
+
+  xnn_operator_t op27 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    40 /* input channels per group */,
+    120 /* output_channels_per_group */,
+    40 /* input pixel stride */,
+    120 /* output pixel stride */,
+    w152.data(), w153.data(),
+    0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op27);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #27" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op27, xnn_delete_operator);
+
+  xnn_operator_t op28 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    2 /* top padding */, 2 /* right padding */,
+    2 /* bottom padding */, 2 /* left padding */,
+    5 /* kernel height */, 5 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    120 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    120 /* input pixel stride */,
+    120 /* output pixel stride */,
+    w154.data(), w155.data(),
+    0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op28);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #28" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op28, xnn_delete_operator);
+
+  xnn_operator_t op29 = nullptr;
+  status = xnn_create_global_average_pooling_ncw_f32(
+    120 /* channels */,
+    -std::numeric_limits<float>::infinity(), std::numeric_limits<float>::infinity(),
+    0 /* flags */,
+    &op29);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #29" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op29, xnn_delete_operator);
+
+  xnn_operator_t op30 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    120 /* input channels per group */,
+    32 /* output_channels_per_group */,
+    120 /* input pixel stride */,
+    32 /* output pixel stride */,
+    w156.data(), w157.data(),
+    0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op30);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #30" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op30, xnn_delete_operator);
+
+  xnn_operator_t op31 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    32 /* input channels per group */,
+    120 /* output_channels_per_group */,
+    32 /* input pixel stride */,
+    120 /* output pixel stride */,
+    w158.data(), w159.data(),
+    0.0f /* output min */, +0x1.00014Fp+0 /* output max */,
+    0 /* flags */,
+    &op31);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #31" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op31, xnn_delete_operator);
+
+  xnn_operator_t op32 = nullptr;
+  status = xnn_create_multiply_nd_f32(
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op32);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #32" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op32, xnn_delete_operator);
+
+  xnn_operator_t op33 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    120 /* input channels per group */,
+    40 /* output_channels_per_group */,
+    120 /* input pixel stride */,
+    40 /* output pixel stride */,
+    w160.data(), w161.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op33);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #33" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op33, xnn_delete_operator);
+
+  xnn_operator_t op34 = nullptr;
+  status = xnn_create_add_nd_f32(
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op34);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #34" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op34, xnn_delete_operator);
+
+  xnn_operator_t op35 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    40 /* input channels per group */,
+    240 /* output_channels_per_group */,
+    40 /* input pixel stride */,
+    240 /* output pixel stride */,
+    w162.data(), w163.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op35);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #35" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op35, xnn_delete_operator);
+
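+  // From here on the blocks use explicit hardswish activations (op36, op38, ...)
+  // rather than the ReLU-style output clamping fused into the earlier
+  // convolutions, consistent with the deeper stages of MobileNet V3.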
+  xnn_operator_t op36 = nullptr;
+  status = xnn_create_hardswish_nc_f32(
+    240 /* channels */,
+    240 /* input stride */,
+    240 /* output stride */,
+    0 /* flags */,
+    &op36);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #36" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op36, xnn_delete_operator);
+
+  xnn_operator_t op37 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    1 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 1 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    2 /* subsampling height */, 2 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    240 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    240 /* input pixel stride */,
+    240 /* output pixel stride */,
+    w164.data(), w165.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op37);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #37" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op37, xnn_delete_operator);
+
+  xnn_operator_t op38 = nullptr;
+  status = xnn_create_hardswish_nc_f32(
+    240 /* channels */,
+    240 /* input stride */,
+    240 /* output stride */,
+    0 /* flags */,
+    &op38);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #38" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op38, xnn_delete_operator);
+
+  xnn_operator_t op39 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    240 /* input channels per group */,
+    80 /* output_channels_per_group */,
+    240 /* input pixel stride */,
+    80 /* output pixel stride */,
+    w166.data(), w167.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op39);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #39" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op39, xnn_delete_operator);
+
+  xnn_operator_t op40 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    80 /* input channels per group */,
+    200 /* output_channels_per_group */,
+    80 /* input pixel stride */,
+    200 /* output pixel stride */,
+    w168.data(), w169.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op40);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #40" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op40, xnn_delete_operator);
+
+  xnn_operator_t op41 = nullptr;
+  status = xnn_create_hardswish_nc_f32(
+    200 /* channels */,
+    200 /* input stride */,
+    200 /* output stride */,
+    0 /* flags */,
+    &op41);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #41" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op41, xnn_delete_operator);
+
+  xnn_operator_t op42 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    1 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 1 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    200 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    200 /* input pixel stride */,
+    200 /* output pixel stride */,
+    w170.data(), w171.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op42);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #42" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op42, xnn_delete_operator);
+
+  xnn_operator_t op43 = nullptr;
+  status = xnn_create_hardswish_nc_f32(
+    200 /* channels */,
+    200 /* input stride */,
+    200 /* output stride */,
+    0 /* flags */,
+    &op43);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #43" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op43, xnn_delete_operator);
+
+  xnn_operator_t op44 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    200 /* input channels per group */,
+    80 /* output_channels_per_group */,
+    200 /* input pixel stride */,
+    80 /* output pixel stride */,
+    w172.data(), w173.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op44);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #44" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op44, xnn_delete_operator);
+
+  xnn_operator_t op45 = nullptr;
+  status = xnn_create_add_nd_f32(
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op45);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #45" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op45, xnn_delete_operator);
+
+  xnn_operator_t op46 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    80 /* input channels per group */,
+    184 /* output_channels_per_group */,
+    80 /* input pixel stride */,
+    184 /* output pixel stride */,
+    w174.data(), w175.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op46);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #46" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op46, xnn_delete_operator);
+
+  xnn_operator_t op47 = nullptr;
+  status = xnn_create_hardswish_nc_f32(
+    184 /* channels */,
+    184 /* input stride */,
+    184 /* output stride */,
+    0 /* flags */,
+    &op47);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #47" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op47, xnn_delete_operator);
+
+  xnn_operator_t op48 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    1 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 1 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    184 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    184 /* input pixel stride */,
+    184 /* output pixel stride */,
+    w176.data(), w177.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op48);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #48" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op48, xnn_delete_operator);
+
+  xnn_operator_t op49 = nullptr;
+  status = xnn_create_hardswish_nc_f32(
+    184 /* channels */,
+    184 /* input stride */,
+    184 /* output stride */,
+    0 /* flags */,
+    &op49);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #49" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op49, xnn_delete_operator);
+
+  xnn_operator_t op50 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    184 /* input channels per group */,
+    80 /* output_channels_per_group */,
+    184 /* input pixel stride */,
+    80 /* output pixel stride */,
+    w178.data(), w179.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op50);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #50" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op50, xnn_delete_operator);
+
+  xnn_operator_t op51 = nullptr;
+  status = xnn_create_add_nd_f32(
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op51);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #51" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op51, xnn_delete_operator);
+
+  xnn_operator_t op52 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    80 /* input channels per group */,
+    184 /* output_channels_per_group */,
+    80 /* input pixel stride */,
+    184 /* output pixel stride */,
+    w180.data(), w181.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op52);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #52" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op52, xnn_delete_operator);
+
+  xnn_operator_t op53 = nullptr;
+  status = xnn_create_hardswish_nc_f32(
+    184 /* channels */,
+    184 /* input stride */,
+    184 /* output stride */,
+    0 /* flags */,
+    &op53);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #53" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op53, xnn_delete_operator);
+
+  xnn_operator_t op54 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    1 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 1 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    184 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    184 /* input pixel stride */,
+    184 /* output pixel stride */,
+    w182.data(), w183.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op54);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #54" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op54, xnn_delete_operator);
+
+  xnn_operator_t op55 = nullptr;
+  status = xnn_create_hardswish_nc_f32(
+    184 /* channels */,
+    184 /* input stride */,
+    184 /* output stride */,
+    0 /* flags */,
+    &op55);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #55" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op55, xnn_delete_operator);
+
+  xnn_operator_t op56 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    184 /* input channels per group */,
+    80 /* output_channels_per_group */,
+    184 /* input pixel stride */,
+    80 /* output pixel stride */,
+    w184.data(), w185.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op56);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #56" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op56, xnn_delete_operator);
+
+  xnn_operator_t op57 = nullptr;
+  status = xnn_create_add_nd_f32(
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op57);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #57" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op57, xnn_delete_operator);
+
+  xnn_operator_t op58 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    80 /* input channels per group */,
+    480 /* output_channels_per_group */,
+    80 /* input pixel stride */,
+    480 /* output pixel stride */,
+    w186.data(), w187.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op58);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #58" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op58, xnn_delete_operator);
+
+  xnn_operator_t op59 = nullptr;
+  status = xnn_create_hardswish_nc_f32(
+    480 /* channels */,
+    480 /* input stride */,
+    480 /* output stride */,
+    0 /* flags */,
+    &op59);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #59" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op59, xnn_delete_operator);
+
+  xnn_operator_t op60 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    1 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 1 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    480 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    480 /* input pixel stride */,
+    480 /* output pixel stride */,
+    w188.data(), w189.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op60);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #60" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op60, xnn_delete_operator);
+
+  xnn_operator_t op61 = nullptr;
+  status = xnn_create_hardswish_nc_f32(
+    480 /* channels */,
+    480 /* input stride */,
+    480 /* output stride */,
+    0 /* flags */,
+    &op61);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #61" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op61, xnn_delete_operator);
+
+  xnn_operator_t op62 = nullptr;
+  status = xnn_create_global_average_pooling_ncw_f32(
+    480 /* channels */,
+    -std::numeric_limits<float>::infinity(), std::numeric_limits<float>::infinity(),
+    0 /* flags */,
+    &op62);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #62" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op62, xnn_delete_operator);
+
+  xnn_operator_t op63 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    480 /* input channels per group */,
+    120 /* output_channels_per_group */,
+    480 /* input pixel stride */,
+    120 /* output pixel stride */,
+    w190.data(), w191.data(),
+    0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op63);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #63" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op63, xnn_delete_operator);
+
+  xnn_operator_t op64 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    120 /* input channels per group */,
+    480 /* output_channels_per_group */,
+    120 /* input pixel stride */,
+    480 /* output pixel stride */,
+    w192.data(), w193.data(),
+    0.0f /* output min */, +0x1.00014Fp+0 /* output max */,
+    0 /* flags */,
+    &op64);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #64" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op64, xnn_delete_operator);
+
+  xnn_operator_t op65 = nullptr;
+  status = xnn_create_multiply_nd_f32(
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op65);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #65" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op65, xnn_delete_operator);
+
+  xnn_operator_t op66 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    480 /* input channels per group */,
+    112 /* output_channels_per_group */,
+    480 /* input pixel stride */,
+    112 /* output pixel stride */,
+    w194.data(), w195.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op66);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #66" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op66, xnn_delete_operator);
+
+  xnn_operator_t op67 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    112 /* input channels per group */,
+    672 /* output_channels_per_group */,
+    112 /* input pixel stride */,
+    672 /* output pixel stride */,
+    w196.data(), w197.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op67);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #67" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op67, xnn_delete_operator);
+
+  xnn_operator_t op68 = nullptr;
+  status = xnn_create_hardswish_nc_f32(
+    672 /* channels */,
+    672 /* input stride */,
+    672 /* output stride */,
+    0 /* flags */,
+    &op68);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #68" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op68, xnn_delete_operator);
+
+  xnn_operator_t op69 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    1 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 1 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    672 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    672 /* input pixel stride */,
+    672 /* output pixel stride */,
+    w198.data(), w199.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op69);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #69" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op69, xnn_delete_operator);
+
+  xnn_operator_t op70 = nullptr;
+  status = xnn_create_hardswish_nc_f32(
+    672 /* channels */,
+    672 /* input stride */,
+    672 /* output stride */,
+    0 /* flags */,
+    &op70);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #70" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op70, xnn_delete_operator);
+
+  xnn_operator_t op71 = nullptr;
+  status = xnn_create_global_average_pooling_ncw_f32(
+    672 /* channels */,
+    -std::numeric_limits<float>::infinity(), std::numeric_limits<float>::infinity(),
+    0 /* flags */,
+    &op71);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #71" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op71, xnn_delete_operator);
+
+  xnn_operator_t op72 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    672 /* input channels per group */,
+    168 /* output_channels_per_group */,
+    672 /* input pixel stride */,
+    168 /* output pixel stride */,
+    w200.data(), w201.data(),
+    0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op72);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #72" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op72, xnn_delete_operator);
+
+  xnn_operator_t op73 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    168 /* input channels per group */,
+    672 /* output_channels_per_group */,
+    168 /* input pixel stride */,
+    672 /* output pixel stride */,
+    w202.data(), w203.data(),
+    0.0f /* output min */, +0x1.00014Fp+0 /* output max */,
+    0 /* flags */,
+    &op73);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #73" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op73, xnn_delete_operator);
+
+  xnn_operator_t op74 = nullptr;
+  status = xnn_create_multiply_nd_f32(
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op74);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #74" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op74, xnn_delete_operator);
+
+  xnn_operator_t op75 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    672 /* input channels per group */,
+    112 /* output_channels_per_group */,
+    672 /* input pixel stride */,
+    112 /* output pixel stride */,
+    w204.data(), w205.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op75);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #75" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op75, xnn_delete_operator);
+
+  xnn_operator_t op76 = nullptr;
+  status = xnn_create_add_nd_f32(
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op76);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #76" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op76, xnn_delete_operator);
+
+  xnn_operator_t op77 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    112 /* input channels per group */,
+    672 /* output_channels_per_group */,
+    112 /* input pixel stride */,
+    672 /* output pixel stride */,
+    w206.data(), w207.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op77);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #77" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op77, xnn_delete_operator);
+
+  xnn_operator_t op78 = nullptr;
+  status = xnn_create_hardswish_nc_f32(
+    672 /* channels */,
+    672 /* input stride */,
+    672 /* output stride */,
+    0 /* flags */,
+    &op78);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #78" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op78, xnn_delete_operator);
+
+  xnn_operator_t op79 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    2 /* top padding */, 2 /* right padding */,
+    2 /* bottom padding */, 2 /* left padding */,
+    5 /* kernel height */, 5 /* kernel width */,
+    2 /* subsampling height */, 2 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    672 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    672 /* input pixel stride */,
+    672 /* output pixel stride */,
+    w208.data(), w209.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op79);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #79" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op79, xnn_delete_operator);
+
+  xnn_operator_t op80 = nullptr;
+  status = xnn_create_hardswish_nc_f32(
+    672 /* channels */,
+    672 /* input stride */,
+    672 /* output stride */,
+    0 /* flags */,
+    &op80);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #80" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op80, xnn_delete_operator);
+
+  xnn_operator_t op81 = nullptr;
+  status = xnn_create_global_average_pooling_ncw_f32(
+    672 /* channels */,
+    -std::numeric_limits<float>::infinity(), std::numeric_limits<float>::infinity(),
+    0 /* flags */,
+    &op81);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #81" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op81, xnn_delete_operator);
+
+  xnn_operator_t op82 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    672 /* input channels per group */,
+    168 /* output_channels_per_group */,
+    672 /* input pixel stride */,
+    168 /* output pixel stride */,
+    w210.data(), w211.data(),
+    0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op82);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #82" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op82, xnn_delete_operator);
+
+  xnn_operator_t op83 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    168 /* input channels per group */,
+    672 /* output_channels_per_group */,
+    168 /* input pixel stride */,
+    672 /* output pixel stride */,
+    w212.data(), w213.data(),
+    0.0f /* output min */, +0x1.00014Fp+0 /* output max */,
+    0 /* flags */,
+    &op83);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #83" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op83, xnn_delete_operator);
+
+  xnn_operator_t op84 = nullptr;
+  status = xnn_create_multiply_nd_f32(
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op84);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #84" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op84, xnn_delete_operator);
+
+  xnn_operator_t op85 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    672 /* input channels per group */,
+    160 /* output_channels_per_group */,
+    672 /* input pixel stride */,
+    160 /* output pixel stride */,
+    w214.data(), w215.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op85);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #85" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op85, xnn_delete_operator);
+
+  xnn_operator_t op86 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    160 /* input channels per group */,
+    960 /* output_channels_per_group */,
+    160 /* input pixel stride */,
+    960 /* output pixel stride */,
+    w216.data(), w217.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op86);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #86" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op86, xnn_delete_operator);
+
+  xnn_operator_t op87 = nullptr;
+  status = xnn_create_hardswish_nc_f32(
+    960 /* channels */,
+    960 /* input stride */,
+    960 /* output stride */,
+    0 /* flags */,
+    &op87);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #87" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op87, xnn_delete_operator);
+
+  xnn_operator_t op88 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    2 /* top padding */, 2 /* right padding */,
+    2 /* bottom padding */, 2 /* left padding */,
+    5 /* kernel height */, 5 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    960 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    960 /* input pixel stride */,
+    960 /* output pixel stride */,
+    w218.data(), w219.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op88);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #88" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op88, xnn_delete_operator);
+
+  xnn_operator_t op89 = nullptr;
+  status = xnn_create_hardswish_nc_f32(
+    960 /* channels */,
+    960 /* input stride */,
+    960 /* output stride */,
+    0 /* flags */,
+    &op89);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #89" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op89, xnn_delete_operator);
+
+  xnn_operator_t op90 = nullptr;
+  status = xnn_create_global_average_pooling_ncw_f32(
+    960 /* channels */,
+    -std::numeric_limits<float>::infinity(), std::numeric_limits<float>::infinity(),
+    0 /* flags */,
+    &op90);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #90" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op90, xnn_delete_operator);
+
+  xnn_operator_t op91 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    960 /* input channels per group */,
+    240 /* output_channels_per_group */,
+    960 /* input pixel stride */,
+    240 /* output pixel stride */,
+    w220.data(), w221.data(),
+    0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op91);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #91" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op91, xnn_delete_operator);
+
+  xnn_operator_t op92 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    240 /* input channels per group */,
+    960 /* output_channels_per_group */,
+    240 /* input pixel stride */,
+    960 /* output pixel stride */,
+    w222.data(), w223.data(),
+    0.0f /* output min */, +0x1.00014Fp+0 /* output max */,
+    0 /* flags */,
+    &op92);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #92" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op92, xnn_delete_operator);
+
+  xnn_operator_t op93 = nullptr;
+  status = xnn_create_multiply_nd_f32(
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op93);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #93" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op93, xnn_delete_operator);
+
+  xnn_operator_t op94 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    960 /* input channels per group */,
+    160 /* output_channels_per_group */,
+    960 /* input pixel stride */,
+    160 /* output pixel stride */,
+    w224.data(), w225.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op94);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #94" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op94, xnn_delete_operator);
+
+  xnn_operator_t op95 = nullptr;
+  status = xnn_create_add_nd_f32(
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op95);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #95" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op95, xnn_delete_operator);
+
+  xnn_operator_t op96 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    160 /* input channels per group */,
+    960 /* output_channels_per_group */,
+    160 /* input pixel stride */,
+    960 /* output pixel stride */,
+    w226.data(), w227.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op96);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #96" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op96, xnn_delete_operator);
+
+  xnn_operator_t op97 = nullptr;
+  status = xnn_create_hardswish_nc_f32(
+    960 /* channels */,
+    960 /* input stride */,
+    960 /* output stride */,
+    0 /* flags */,
+    &op97);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #97" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op97, xnn_delete_operator);
+
+  xnn_operator_t op98 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    2 /* top padding */, 2 /* right padding */,
+    2 /* bottom padding */, 2 /* left padding */,
+    5 /* kernel height */, 5 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    960 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    960 /* input pixel stride */,
+    960 /* output pixel stride */,
+    w228.data(), w229.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op98);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #98" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op98, xnn_delete_operator);
+
+  xnn_operator_t op99 = nullptr;
+  status = xnn_create_hardswish_nc_f32(
+    960 /* channels */,
+    960 /* input stride */,
+    960 /* output stride */,
+    0 /* flags */,
+    &op99);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #99" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op99, xnn_delete_operator);
+
+  xnn_operator_t op100 = nullptr;
+  status = xnn_create_global_average_pooling_ncw_f32(
+    960 /* channels */,
+    -std::numeric_limits<float>::infinity(), std::numeric_limits<float>::infinity(),
+    0 /* flags */,
+    &op100);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #100" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op100, xnn_delete_operator);
+
+  xnn_operator_t op101 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    960 /* input channels per group */,
+    240 /* output_channels_per_group */,
+    960 /* input pixel stride */,
+    240 /* output pixel stride */,
+    w230.data(), w231.data(),
+    0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op101);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #101" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op101, xnn_delete_operator);
+
+  xnn_operator_t op102 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    240 /* input channels per group */,
+    960 /* output_channels_per_group */,
+    240 /* input pixel stride */,
+    960 /* output pixel stride */,
+    w232.data(), w233.data(),
+    0.0f /* output min */, +0x1.00014Fp+0 /* output max */,
+    0 /* flags */,
+    &op102);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #102" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op102, xnn_delete_operator);
+
+  xnn_operator_t op103 = nullptr;
+  status = xnn_create_multiply_nd_f32(
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op103);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #103" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op103, xnn_delete_operator);
+
+  xnn_operator_t op104 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    960 /* input channels per group */,
+    160 /* output_channels_per_group */,
+    960 /* input pixel stride */,
+    160 /* output pixel stride */,
+    w234.data(), w235.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op104);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #104" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op104, xnn_delete_operator);
+
+  xnn_operator_t op105 = nullptr;
+  status = xnn_create_add_nd_f32(
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op105);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #105" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op105, xnn_delete_operator);
+
+  xnn_operator_t op106 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    160 /* input channels per group */,
+    960 /* output_channels_per_group */,
+    160 /* input pixel stride */,
+    960 /* output pixel stride */,
+    w236.data(), w237.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op106);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #106" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op106, xnn_delete_operator);
+
+  xnn_operator_t op107 = nullptr;
+  status = xnn_create_hardswish_nc_f32(
+    960 /* channels */,
+    960 /* input stride */,
+    960 /* output stride */,
+    0 /* flags */,
+    &op107);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #107" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op107, xnn_delete_operator);
+
+  xnn_operator_t op108 = nullptr;
+  status = xnn_create_global_average_pooling_ncw_f32(
+    960 /* channels */,
+    -std::numeric_limits<float>::infinity(), std::numeric_limits<float>::infinity(),
+    0 /* flags */,
+    &op108);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #108" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op108, xnn_delete_operator);
+
+  xnn_operator_t op109 = nullptr;
+  status = xnn_create_convolution2d_nhwc_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    960 /* input channels per group */,
+    1280 /* output_channels_per_group */,
+    960 /* input pixel stride */,
+    1280 /* output pixel stride */,
+    w238.data(), w239.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op109);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #109" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op109, xnn_delete_operator);
+
+  xnn_operator_t op110 = nullptr;
+  status = xnn_create_hardswish_nc_f32(
+    1280 /* channels */,
+    1280 /* input stride */,
+    1280 /* output stride */,
+    0 /* flags */,
+    &op110);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #110" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op110, xnn_delete_operator);
+
+  xnn_operator_t op111 = nullptr;
+  status = xnn_create_global_average_pooling_nwc_f32(
+    1280 /* channels */, 1280 /* input stride */, 1280 /* output stride */,
+    -std::numeric_limits<float>::infinity(), std::numeric_limits<float>::infinity(),
+    0 /* flags */,
+    &op111);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #111" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op111, xnn_delete_operator);
+
+  xnn_operator_t op112 = nullptr;
+  status = xnn_create_convolution2d_nhwc_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    1280 /* input channels per group */,
+    1001 /* output_channels_per_group */,
+    1280 /* input pixel stride */,
+    1001 /* output pixel stride */,
+    w240.data(), w241.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op112);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #112" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op112, xnn_delete_operator);
+
+
+
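+  // Setup phase: bind each operator created above to its input/output buffers and spatial dimensions.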
+  status = xnn_setup_convolution2d_nchw_f32(
+    op0,
+    1 /* batch size */, 224 /* input height */, 224 /* input width */,
+    v0.data() /* input */, v1.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #0" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_hardswish_nc_f32(
+    op1,
+    12544 /* batch size */,
+    v1.data() /* input */, v2.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #1" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op2,
+    1 /* batch size */, 112 /* input height */, 112 /* input width */,
+    v2.data() /* input */, v3.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #2" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op3,
+    1 /* batch size */, 112 /* input height */, 112 /* input width */,
+    v3.data() /* input */, v4.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #3" << std::endl;
+    return ExecutionPlan();
+  }
+
+  {
+    const size_t a_shape[] = { 1, 16, 112, 112 };
+    const size_t b_shape[] = { 1, 16, 112, 112 };
+    status = xnn_setup_add_nd_f32(
+      op4,
+      4, a_shape, 4, b_shape,
+      v4.data() /* a */, v2.data() /* b */, v5.data() /* output */,
+      threadpool /* threadpool */);
+  }
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #4" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op5,
+    1 /* batch size */, 112 /* input height */, 112 /* input width */,
+    v5.data() /* input */, v6.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #5" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op6,
+    1 /* batch size */, 112 /* input height */, 112 /* input width */,
+    v6.data() /* input */, v7.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #6" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op7,
+    1 /* batch size */, 56 /* input height */, 56 /* input width */,
+    v7.data() /* input */, v8.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #7" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op8,
+    1 /* batch size */, 56 /* input height */, 56 /* input width */,
+    v8.data() /* input */, v9.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #8" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op9,
+    1 /* batch size */, 56 /* input height */, 56 /* input width */,
+    v9.data() /* input */, v10.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #9" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op10,
+    1 /* batch size */, 56 /* input height */, 56 /* input width */,
+    v10.data() /* input */, v11.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #10" << std::endl;
+    return ExecutionPlan();
+  }
+
+  {
+    const size_t a_shape[] = { 1, 24, 56, 56 };
+    const size_t b_shape[] = { 1, 24, 56, 56 };
+    status = xnn_setup_add_nd_f32(
+      op11,
+      4, a_shape, 4, b_shape,
+      v11.data() /* a */, v8.data() /* b */, v12.data() /* output */,
+      threadpool /* threadpool */);
+  }
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #11" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op12,
+    1 /* batch size */, 56 /* input height */, 56 /* input width */,
+    v12.data() /* input */, v13.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #12" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op13,
+    1 /* batch size */, 56 /* input height */, 56 /* input width */,
+    v13.data() /* input */, v14.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #13" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_global_average_pooling_ncw_f32(
+    op14,
+    1 /* batch size */, 784 /* width */,
+    v14.data() /* input */, v15.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #14" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op15,
+    1 /* batch size */, 1 /* input height */, 1 /* input width */,
+    v15.data() /* input */, v16.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #15" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op16,
+    1 /* batch size */, 1 /* input height */, 1 /* input width */,
+    v16.data() /* input */, v17.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #16" << std::endl;
+    return ExecutionPlan();
+  }
+
+  {
+    const size_t a_shape[] = { 1, 72, 28, 28 };
+    const size_t b_shape[] = { 1, 72, 1, 1 };
+    status = xnn_setup_multiply_nd_f32(
+      op17,
+      4, a_shape, 4, b_shape,
+      v14.data() /* a */, v17.data() /* b */, v18.data() /* output */,
+      threadpool /* threadpool */);
+  }
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #17" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op18,
+    1 /* batch size */, 28 /* input height */, 28 /* input width */,
+    v18.data() /* input */, v19.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #18" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op19,
+    1 /* batch size */, 28 /* input height */, 28 /* input width */,
+    v19.data() /* input */, v20.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #19" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op20,
+    1 /* batch size */, 28 /* input height */, 28 /* input width */,
+    v20.data() /* input */, v21.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #20" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_global_average_pooling_ncw_f32(
+    op21,
+    1 /* batch size */, 784 /* width */,
+    v21.data() /* input */, v22.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #21" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op22,
+    1 /* batch size */, 1 /* input height */, 1 /* input width */,
+    v22.data() /* input */, v23.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #22" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op23,
+    1 /* batch size */, 1 /* input height */, 1 /* input width */,
+    v23.data() /* input */, v24.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #23" << std::endl;
+    return ExecutionPlan();
+  }
+
+  {
+    const size_t a_shape[] = { 1, 120, 28, 28 };
+    const size_t b_shape[] = { 1, 120, 1, 1 };
+    status = xnn_setup_multiply_nd_f32(
+      op24,
+      4, a_shape, 4, b_shape,
+      v21.data() /* a */, v24.data() /* b */, v25.data() /* output */,
+      threadpool /* threadpool */);
+  }
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #24" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op25,
+    1 /* batch size */, 28 /* input height */, 28 /* input width */,
+    v25.data() /* input */, v26.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #25" << std::endl;
+    return ExecutionPlan();
+  }
+
+  {
+    const size_t a_shape[] = { 1, 40, 28, 28 };
+    const size_t b_shape[] = { 1, 40, 28, 28 };
+    status = xnn_setup_add_nd_f32(
+      op26,
+      4, a_shape, 4, b_shape,
+      v26.data() /* a */, v19.data() /* b */, v27.data() /* output */,
+      threadpool /* threadpool */);
+  }
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #26" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op27,
+    1 /* batch size */, 28 /* input height */, 28 /* input width */,
+    v27.data() /* input */, v28.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #27" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op28,
+    1 /* batch size */, 28 /* input height */, 28 /* input width */,
+    v28.data() /* input */, v29.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #28" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_global_average_pooling_ncw_f32(
+    op29,
+    1 /* batch size */, 784 /* width */,
+    v29.data() /* input */, v30.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #29" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op30,
+    1 /* batch size */, 1 /* input height */, 1 /* input width */,
+    v30.data() /* input */, v31.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #30" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op31,
+    1 /* batch size */, 1 /* input height */, 1 /* input width */,
+    v31.data() /* input */, v32.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #31" << std::endl;
+    return ExecutionPlan();
+  }
+
+  {
+    const size_t a_shape[] = { 1, 120, 28, 28 };
+    const size_t b_shape[] = { 1, 120, 1, 1 };
+    status = xnn_setup_multiply_nd_f32(
+      op32,
+      4, a_shape, 4, b_shape,
+      v29.data() /* a */, v32.data() /* b */, v33.data() /* output */,
+      threadpool /* threadpool */);
+  }
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #32" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op33,
+    1 /* batch size */, 28 /* input height */, 28 /* input width */,
+    v33.data() /* input */, v34.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #33" << std::endl;
+    return ExecutionPlan();
+  }
+
+  {
+    const size_t a_shape[] = { 1, 40, 28, 28 };
+    const size_t b_shape[] = { 1, 40, 28, 28 };
+    status = xnn_setup_add_nd_f32(
+      op34,
+      4, a_shape, 4, b_shape,
+      v34.data() /* a */, v27.data() /* b */, v35.data() /* output */,
+      threadpool /* threadpool */);
+  }
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #34" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op35,
+    1 /* batch size */, 28 /* input height */, 28 /* input width */,
+    v35.data() /* input */, v36.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #35" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_hardswish_nc_f32(
+    op36,
+    784 /* batch size */,
+    v36.data() /* input */, v37.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #36" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op37,
+    1 /* batch size */, 28 /* input height */, 28 /* input width */,
+    v37.data() /* input */, v38.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #37" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_hardswish_nc_f32(
+    op38,
+    196 /* batch size */,
+    v38.data() /* input */, v39.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #38" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op39,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v39.data() /* input */, v40.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #39" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op40,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v40.data() /* input */, v41.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #40" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_hardswish_nc_f32(
+    op41,
+    196 /* batch size */,
+    v41.data() /* input */, v42.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #41" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op42,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v42.data() /* input */, v43.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #42" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_hardswish_nc_f32(
+    op43,
+    196 /* batch size */,
+    v43.data() /* input */, v44.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #43" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op44,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v44.data() /* input */, v45.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #44" << std::endl;
+    return ExecutionPlan();
+  }
+
+  {
+    const size_t a_shape[] = { 1, 80, 14, 14 };
+    const size_t b_shape[] = { 1, 80, 14, 14 };
+    status = xnn_setup_add_nd_f32(
+      op45,
+      4, a_shape, 4, b_shape,
+      v45.data() /* a */, v40.data() /* b */, v46.data() /* output */,
+      threadpool /* threadpool */);
+  }
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #45" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op46,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v46.data() /* input */, v47.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #46" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_hardswish_nc_f32(
+    op47,
+    196 /* batch size */,
+    v47.data() /* input */, v48.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #47" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op48,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v48.data() /* input */, v49.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #48" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_hardswish_nc_f32(
+    op49,
+    196 /* batch size */,
+    v49.data() /* input */, v50.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #49" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op50,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v50.data() /* input */, v51.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #50" << std::endl;
+    return ExecutionPlan();
+  }
+
+  {
+    const size_t a_shape[] = { 1, 80, 14, 14 };
+    const size_t b_shape[] = { 1, 80, 14, 14 };
+    status = xnn_setup_add_nd_f32(
+      op51,
+      4, a_shape, 4, b_shape,
+      v51.data() /* a */, v46.data() /* b */, v52.data() /* output */,
+      threadpool /* threadpool */);
+  }
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #51" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op52,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v52.data() /* input */, v53.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #52" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_hardswish_nc_f32(
+    op53,
+    196 /* batch size */,
+    v53.data() /* input */, v54.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #53" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op54,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v54.data() /* input */, v55.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #54" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_hardswish_nc_f32(
+    op55,
+    196 /* batch size */,
+    v55.data() /* input */, v56.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #55" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op56,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v56.data() /* input */, v57.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #56" << std::endl;
+    return ExecutionPlan();
+  }
+
+  {
+    const size_t a_shape[] = { 1, 80, 14, 14 };
+    const size_t b_shape[] = { 1, 80, 14, 14 };
+    status = xnn_setup_add_nd_f32(
+      op57,
+      4, a_shape, 4, b_shape,
+      v57.data() /* a */, v52.data() /* b */, v58.data() /* output */,
+      threadpool /* threadpool */);
+  }
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #57" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op58,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v58.data() /* input */, v59.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #58" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_hardswish_nc_f32(
+    op59,
+    196 /* batch size */,
+    v59.data() /* input */, v60.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #59" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op60,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v60.data() /* input */, v61.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #60" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_hardswish_nc_f32(
+    op61,
+    196 /* batch size */,
+    v61.data() /* input */, v62.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #61" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_global_average_pooling_ncw_f32(
+    op62,
+    1 /* batch size */, 196 /* width */,
+    v62.data() /* input */, v63.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #62" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op63,
+    1 /* batch size */, 1 /* input height */, 1 /* input width */,
+    v63.data() /* input */, v64.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #63" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op64,
+    1 /* batch size */, 1 /* input height */, 1 /* input width */,
+    v64.data() /* input */, v65.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #64" << std::endl;
+    return ExecutionPlan();
+  }
+
+  {
+    const size_t a_shape[] = { 1, 480, 14, 14 };
+    const size_t b_shape[] = { 1, 480, 1, 1 };
+    status = xnn_setup_multiply_nd_f32(
+      op65,
+      4, a_shape, 4, b_shape,
+      v62.data() /* a */, v65.data() /* b */, v66.data() /* output */,
+      threadpool /* threadpool */);
+  }
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #65" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op66,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v66.data() /* input */, v67.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #66" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op67,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v67.data() /* input */, v68.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #67" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_hardswish_nc_f32(
+    op68,
+    196 /* batch size */,
+    v68.data() /* input */, v69.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #68" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op69,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v69.data() /* input */, v70.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #69" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_hardswish_nc_f32(
+    op70,
+    196 /* batch size */,
+    v70.data() /* input */, v71.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #70" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_global_average_pooling_ncw_f32(
+    op71,
+    1 /* batch size */, 196 /* width */,
+    v71.data() /* input */, v72.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #71" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op72,
+    1 /* batch size */, 1 /* input height */, 1 /* input width */,
+    v72.data() /* input */, v73.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #72" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op73,
+    1 /* batch size */, 1 /* input height */, 1 /* input width */,
+    v73.data() /* input */, v74.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #73" << std::endl;
+    return ExecutionPlan();
+  }
+
+  {
+    const size_t a_shape[] = { 1, 672, 14, 14 };
+    const size_t b_shape[] = { 1, 672, 1, 1 };
+    status = xnn_setup_multiply_nd_f32(
+      op74,
+      4, a_shape, 4, b_shape,
+      v71.data() /* a */, v74.data() /* b */, v75.data() /* output */,
+      threadpool /* threadpool */);
+  }
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #74" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op75,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v75.data() /* input */, v76.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #75" << std::endl;
+    return ExecutionPlan();
+  }
+
+  {
+    const size_t a_shape[] = { 1, 112, 14, 14 };
+    const size_t b_shape[] = { 1, 112, 14, 14 };
+    status = xnn_setup_add_nd_f32(
+      op76,
+      4, a_shape, 4, b_shape,
+      v76.data() /* a */, v67.data() /* b */, v77.data() /* output */,
+      threadpool /* threadpool */);
+  }
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #76" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op77,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v77.data() /* input */, v78.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #77" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_hardswish_nc_f32(
+    op78,
+    196 /* batch size */,
+    v78.data() /* input */, v79.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #78" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op79,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v79.data() /* input */, v80.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #79" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_hardswish_nc_f32(
+    op80,
+    49 /* batch size */,
+    v80.data() /* input */, v81.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #80" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_global_average_pooling_ncw_f32(
+    op81,
+    1 /* batch size */, 49 /* width */,
+    v81.data() /* input */, v82.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #81" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op82,
+    1 /* batch size */, 1 /* input height */, 1 /* input width */,
+    v82.data() /* input */, v83.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #82" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op83,
+    1 /* batch size */, 1 /* input height */, 1 /* input width */,
+    v83.data() /* input */, v84.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #83" << std::endl;
+    return ExecutionPlan();
+  }
+
+  {
+    const size_t a_shape[] = { 1, 672, 7, 7 };
+    const size_t b_shape[] = { 1, 672, 1, 1 };
+    status = xnn_setup_multiply_nd_f32(
+      op84,
+      4, a_shape, 4, b_shape,
+      v81.data() /* a */, v84.data() /* b */, v85.data() /* output */,
+      threadpool /* threadpool */);
+  }
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #84" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op85,
+    1 /* batch size */, 7 /* input height */, 7 /* input width */,
+    v85.data() /* input */, v86.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #85" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op86,
+    1 /* batch size */, 7 /* input height */, 7 /* input width */,
+    v86.data() /* input */, v87.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #86" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_hardswish_nc_f32(
+    op87,
+    49 /* batch size */,
+    v87.data() /* input */, v88.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #87" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op88,
+    1 /* batch size */, 7 /* input height */, 7 /* input width */,
+    v88.data() /* input */, v89.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #88" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_hardswish_nc_f32(
+    op89,
+    49 /* batch size */,
+    v89.data() /* input */, v90.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #89" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_global_average_pooling_ncw_f32(
+    op90,
+    1 /* batch size */, 49 /* width */,
+    v90.data() /* input */, v91.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #90" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op91,
+    1 /* batch size */, 1 /* input height */, 1 /* input width */,
+    v91.data() /* input */, v92.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #91" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op92,
+    1 /* batch size */, 1 /* input height */, 1 /* input width */,
+    v92.data() /* input */, v93.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #92" << std::endl;
+    return ExecutionPlan();
+  }
+
+  {
+    const size_t a_shape[] = { 1, 960, 7, 7 };
+    const size_t b_shape[] = { 1, 960, 1, 1 };
+    status = xnn_setup_multiply_nd_f32(
+      op93,
+      4, a_shape, 4, b_shape,
+      v90.data() /* a */, v93.data() /* b */, v94.data() /* output */,
+      threadpool /* threadpool */);
+  }
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #93" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op94,
+    1 /* batch size */, 7 /* input height */, 7 /* input width */,
+    v94.data() /* input */, v95.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #94" << std::endl;
+    return ExecutionPlan();
+  }
+
+  {
+    const size_t a_shape[] = { 1, 160, 7, 7 };
+    const size_t b_shape[] = { 1, 160, 7, 7 };
+    status = xnn_setup_add_nd_f32(
+      op95,
+      4, a_shape, 4, b_shape,
+      v95.data() /* a */, v86.data() /* b */, v96.data() /* output */,
+      threadpool /* threadpool */);
+  }
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #95" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op96,
+    1 /* batch size */, 7 /* input height */, 7 /* input width */,
+    v96.data() /* input */, v97.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #96" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_hardswish_nc_f32(
+    op97,
+    49 /* batch size */,
+    v97.data() /* input */, v98.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #97" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op98,
+    1 /* batch size */, 7 /* input height */, 7 /* input width */,
+    v98.data() /* input */, v99.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #98" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_hardswish_nc_f32(
+    op99,
+    49 /* batch size */,
+    v99.data() /* input */, v100.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #99" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_global_average_pooling_ncw_f32(
+    op100,
+    1 /* batch size */, 49 /* width */,
+    v100.data() /* input */, v101.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #100" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op101,
+    1 /* batch size */, 1 /* input height */, 1 /* input width */,
+    v101.data() /* input */, v102.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #101" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op102,
+    1 /* batch size */, 1 /* input height */, 1 /* input width */,
+    v102.data() /* input */, v103.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #102" << std::endl;
+    return ExecutionPlan();
+  }
+
+  {
+    const size_t a_shape[] = { 1, 960, 7, 7 };
+    const size_t b_shape[] = { 1, 960, 1, 1 };
+    status = xnn_setup_multiply_nd_f32(
+      op103,
+      4, a_shape, 4, b_shape,
+      v100.data() /* a */, v103.data() /* b */, v104.data() /* output */,
+      threadpool /* threadpool */);
+  }
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #103" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op104,
+    1 /* batch size */, 7 /* input height */, 7 /* input width */,
+    v104.data() /* input */, v105.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #104" << std::endl;
+    return ExecutionPlan();
+  }
+
+  {
+    const size_t a_shape[] = { 1, 160, 7, 7 };
+    const size_t b_shape[] = { 1, 160, 7, 7 };
+    status = xnn_setup_add_nd_f32(
+      op105,
+      4, a_shape, 4, b_shape,
+      v105.data() /* a */, v96.data() /* b */, v106.data() /* output */,
+      threadpool /* threadpool */);
+  }
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #105" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op106,
+    1 /* batch size */, 7 /* input height */, 7 /* input width */,
+    v106.data() /* input */, v107.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #106" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_hardswish_nc_f32(
+    op107,
+    49 /* batch size */,
+    v107.data() /* input */, v108.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #107" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_global_average_pooling_ncw_f32(
+    op108,
+    1 /* batch size */, 49 /* width */,
+    v108.data() /* input */, v109.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #108" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nhwc_f32(
+    op109,
+    1 /* batch size */, 1 /* input height */, 1 /* input width */,
+    v109.data() /* input */, v110.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #109" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_hardswish_nc_f32(
+    op110,
+    1 /* batch size */,
+    v110.data() /* input */, v111.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #110" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_global_average_pooling_nwc_f32(
+    op111,
+    1 /* batch size */, 1 /* width */,
+    v111.data() /* input */, v112.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #111" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nhwc_f32(
+    op112,
+    1 /* batch size */, 1 /* input height */, 1 /* input width */,
+    v112.data() /* input */, v113.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #112" << std::endl;
+    return ExecutionPlan();
+  }
+
+  #pragma clang diagnostic push
+  #pragma clang diagnostic ignored "-Wpessimizing-move"
+  return operators;
+  #pragma clang diagnostic pop
+}
+
+}  // namespace models
diff --git a/models/fp32-sparse-mobilenet-v3-small.cc b/models/fp32-sparse-mobilenet-v3-small.cc
new file mode 100644
index 0000000..7d1260c
--- /dev/null
+++ b/models/fp32-sparse-mobilenet-v3-small.cc
@@ -0,0 +1,3316 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <xnnpack.h>
+
+#include <array>
+#include <algorithm>
+#include <functional>
+#include <iostream>
+#include <limits>
+#include <random>
+
+#include "models/models.h"
+
+namespace models {
+
+ExecutionPlan FP32SparseMobileNetV3Small(float sparsity, pthreadpool_t threadpool) {
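+  // Statically-allocated activation buffers (v0-v99) and weight/bias buffers (w100-w207).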
+  alignas(16) static std::array<float, 150528> v0;
+  alignas(16) static std::array<float, 200704> v1;
+  alignas(16) static std::array<float, 200704> v2;
+  alignas(16) static std::array<float, 50176> v3;
+  alignas(16) static std::array<float, 16> v4;
+  alignas(16) static std::array<float, 8> v5;
+  alignas(16) static std::array<float, 16> v6;
+  alignas(16) static std::array<float, 50176> v7;
+  alignas(16) static std::array<float, 50176> v8;
+  alignas(16) static std::array<float, 225792> v9;
+  alignas(16) static std::array<float, 56448> v10;
+  alignas(16) static std::array<float, 18816> v11;
+  alignas(16) static std::array<float, 68992> v12;
+  alignas(16) static std::array<float, 68992> v13;
+  alignas(16) static std::array<float, 18816> v14;
+  alignas(16) static std::array<float, 18816> v15;
+  alignas(16) static std::array<float, 75264> v16;
+  alignas(16) static std::array<float, 75264> v17;
+  alignas(16) static std::array<float, 18816> v18;
+  alignas(16) static std::array<float, 18816> v19;
+  alignas(16) static std::array<float, 96> v20;
+  alignas(16) static std::array<float, 24> v21;
+  alignas(16) static std::array<float, 96> v22;
+  alignas(16) static std::array<float, 18816> v23;
+  alignas(16) static std::array<float, 7840> v24;
+  alignas(16) static std::array<float, 47040> v25;
+  alignas(16) static std::array<float, 47040> v26;
+  alignas(16) static std::array<float, 47040> v27;
+  alignas(16) static std::array<float, 47040> v28;
+  alignas(16) static std::array<float, 240> v29;
+  alignas(16) static std::array<float, 64> v30;
+  alignas(16) static std::array<float, 240> v31;
+  alignas(16) static std::array<float, 47040> v32;
+  alignas(16) static std::array<float, 7840> v33;
+  alignas(16) static std::array<float, 7840> v34;
+  alignas(16) static std::array<float, 47040> v35;
+  alignas(16) static std::array<float, 47040> v36;
+  alignas(16) static std::array<float, 47040> v37;
+  alignas(16) static std::array<float, 47040> v38;
+  alignas(16) static std::array<float, 240> v39;
+  alignas(16) static std::array<float, 64> v40;
+  alignas(16) static std::array<float, 240> v41;
+  alignas(16) static std::array<float, 47040> v42;
+  alignas(16) static std::array<float, 7840> v43;
+  alignas(16) static std::array<float, 7840> v44;
+  alignas(16) static std::array<float, 23520> v45;
+  alignas(16) static std::array<float, 23520> v46;
+  alignas(16) static std::array<float, 23520> v47;
+  alignas(16) static std::array<float, 23520> v48;
+  alignas(16) static std::array<float, 120> v49;
+  alignas(16) static std::array<float, 32> v50;
+  alignas(16) static std::array<float, 120> v51;
+  alignas(16) static std::array<float, 23520> v52;
+  alignas(16) static std::array<float, 9408> v53;
+  alignas(16) static std::array<float, 28224> v54;
+  alignas(16) static std::array<float, 28224> v55;
+  alignas(16) static std::array<float, 28224> v56;
+  alignas(16) static std::array<float, 28224> v57;
+  alignas(16) static std::array<float, 144> v58;
+  alignas(16) static std::array<float, 40> v59;
+  alignas(16) static std::array<float, 144> v60;
+  alignas(16) static std::array<float, 28224> v61;
+  alignas(16) static std::array<float, 9408> v62;
+  alignas(16) static std::array<float, 9408> v63;
+  alignas(16) static std::array<float, 56448> v64;
+  alignas(16) static std::array<float, 56448> v65;
+  alignas(16) static std::array<float, 14112> v66;
+  alignas(16) static std::array<float, 14112> v67;
+  alignas(16) static std::array<float, 288> v68;
+  alignas(16) static std::array<float, 72> v69;
+  alignas(16) static std::array<float, 288> v70;
+  alignas(16) static std::array<float, 14112> v71;
+  alignas(16) static std::array<float, 4704> v72;
+  alignas(16) static std::array<float, 28224> v73;
+  alignas(16) static std::array<float, 28224> v74;
+  alignas(16) static std::array<float, 28224> v75;
+  alignas(16) static std::array<float, 28224> v76;
+  alignas(16) static std::array<float, 576> v77;
+  alignas(16) static std::array<float, 144> v78;
+  alignas(16) static std::array<float, 576> v79;
+  alignas(16) static std::array<float, 28224> v80;
+  alignas(16) static std::array<float, 4704> v81;
+  alignas(16) static std::array<float, 4704> v82;
+  alignas(16) static std::array<float, 28224> v83;
+  alignas(16) static std::array<float, 28224> v84;
+  alignas(16) static std::array<float, 28224> v85;
+  alignas(16) static std::array<float, 28224> v86;
+  alignas(16) static std::array<float, 576> v87;
+  alignas(16) static std::array<float, 144> v88;
+  alignas(16) static std::array<float, 576> v89;
+  alignas(16) static std::array<float, 28224> v90;
+  alignas(16) static std::array<float, 4704> v91;
+  alignas(16) static std::array<float, 4704> v92;
+  alignas(16) static std::array<float, 28224> v93;
+  alignas(16) static std::array<float, 28224> v94;
+  alignas(16) static std::array<float, 576> v95;
+  alignas(16) static std::array<float, 1024> v96;
+  alignas(16) static std::array<float, 1024> v97;
+  alignas(16) static std::array<float, 1024> v98;
+  alignas(16) static std::array<float, 1001> v99;
+  alignas(16) static std::array<float, 432> w100;
+  alignas(16) static std::array<float, 16> w101;
+  alignas(16) static std::array<float, 144> w102;
+  alignas(16) static std::array<float, 16> w103;
+  alignas(16) static std::array<float, 128> w104;
+  alignas(16) static std::array<float, 8> w105;
+  alignas(16) static std::array<float, 128> w106;
+  alignas(16) static std::array<float, 16> w107;
+  alignas(16) static std::array<float, 256> w108;
+  alignas(16) static std::array<float, 16> w109;
+  alignas(16) static std::array<float, 1152> w110;
+  alignas(16) static std::array<float, 72> w111;
+  alignas(16) static std::array<float, 648> w112;
+  alignas(16) static std::array<float, 72> w113;
+  alignas(16) static std::array<float, 1728> w114;
+  alignas(16) static std::array<float, 24> w115;
+  alignas(16) static std::array<float, 2112> w116;
+  alignas(16) static std::array<float, 88> w117;
+  alignas(16) static std::array<float, 792> w118;
+  alignas(16) static std::array<float, 88> w119;
+  alignas(16) static std::array<float, 2112> w120;
+  alignas(16) static std::array<float, 24> w121;
+  alignas(16) static std::array<float, 2304> w122;
+  alignas(16) static std::array<float, 96> w123;
+  alignas(16) static std::array<float, 2400> w124;
+  alignas(16) static std::array<float, 96> w125;
+  alignas(16) static std::array<float, 2304> w126;
+  alignas(16) static std::array<float, 24> w127;
+  alignas(16) static std::array<float, 2304> w128;
+  alignas(16) static std::array<float, 96> w129;
+  alignas(16) static std::array<float, 3840> w130;
+  alignas(16) static std::array<float, 40> w131;
+  alignas(16) static std::array<float, 9600> w132;
+  alignas(16) static std::array<float, 240> w133;
+  alignas(16) static std::array<float, 6000> w134;
+  alignas(16) static std::array<float, 240> w135;
+  alignas(16) static std::array<float, 15360> w136;
+  alignas(16) static std::array<float, 64> w137;
+  alignas(16) static std::array<float, 15360> w138;
+  alignas(16) static std::array<float, 240> w139;
+  alignas(16) static std::array<float, 9600> w140;
+  alignas(16) static std::array<float, 40> w141;
+  alignas(16) static std::array<float, 9600> w142;
+  alignas(16) static std::array<float, 240> w143;
+  alignas(16) static std::array<float, 6000> w144;
+  alignas(16) static std::array<float, 240> w145;
+  alignas(16) static std::array<float, 15360> w146;
+  alignas(16) static std::array<float, 64> w147;
+  alignas(16) static std::array<float, 15360> w148;
+  alignas(16) static std::array<float, 240> w149;
+  alignas(16) static std::array<float, 9600> w150;
+  alignas(16) static std::array<float, 40> w151;
+  alignas(16) static std::array<float, 4800> w152;
+  alignas(16) static std::array<float, 120> w153;
+  alignas(16) static std::array<float, 3000> w154;
+  alignas(16) static std::array<float, 120> w155;
+  alignas(16) static std::array<float, 3840> w156;
+  alignas(16) static std::array<float, 32> w157;
+  alignas(16) static std::array<float, 3840> w158;
+  alignas(16) static std::array<float, 120> w159;
+  alignas(16) static std::array<float, 5760> w160;
+  alignas(16) static std::array<float, 48> w161;
+  alignas(16) static std::array<float, 6912> w162;
+  alignas(16) static std::array<float, 144> w163;
+  alignas(16) static std::array<float, 3600> w164;
+  alignas(16) static std::array<float, 144> w165;
+  alignas(16) static std::array<float, 5760> w166;
+  alignas(16) static std::array<float, 40> w167;
+  alignas(16) static std::array<float, 5760> w168;
+  alignas(16) static std::array<float, 144> w169;
+  alignas(16) static std::array<float, 6912> w170;
+  alignas(16) static std::array<float, 48> w171;
+  alignas(16) static std::array<float, 13824> w172;
+  alignas(16) static std::array<float, 288> w173;
+  alignas(16) static std::array<float, 7200> w174;
+  alignas(16) static std::array<float, 288> w175;
+  alignas(16) static std::array<float, 20736> w176;
+  alignas(16) static std::array<float, 72> w177;
+  alignas(16) static std::array<float, 20736> w178;
+  alignas(16) static std::array<float, 288> w179;
+  alignas(16) static std::array<float, 27648> w180;
+  alignas(16) static std::array<float, 96> w181;
+  alignas(16) static std::array<float, 55296> w182;
+  alignas(16) static std::array<float, 576> w183;
+  alignas(16) static std::array<float, 14400> w184;
+  alignas(16) static std::array<float, 576> w185;
+  alignas(16) static std::array<float, 82944> w186;
+  alignas(16) static std::array<float, 144> w187;
+  alignas(16) static std::array<float, 82944> w188;
+  alignas(16) static std::array<float, 576> w189;
+  alignas(16) static std::array<float, 55296> w190;
+  alignas(16) static std::array<float, 96> w191;
+  alignas(16) static std::array<float, 55296> w192;
+  alignas(16) static std::array<float, 576> w193;
+  alignas(16) static std::array<float, 14400> w194;
+  alignas(16) static std::array<float, 576> w195;
+  alignas(16) static std::array<float, 82944> w196;
+  alignas(16) static std::array<float, 144> w197;
+  alignas(16) static std::array<float, 82944> w198;
+  alignas(16) static std::array<float, 576> w199;
+  alignas(16) static std::array<float, 55296> w200;
+  alignas(16) static std::array<float, 96> w201;
+  alignas(16) static std::array<float, 55296> w202;
+  alignas(16) static std::array<float, 576> w203;
+  alignas(16) static std::array<float, 589824> w204;
+  alignas(16) static std::array<float, 1024> w205;
+  alignas(16) static std::array<float, 1025024> w206;
+  alignas(16) static std::array<float, 1001> w207;
+
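+  // Fill the activation tensors (v*) and weight/bias buffers (w*) with uniform
+  // random values in [-1, +1]; the same rng is reused below to shuffle the
+  // sparsely initialized kernels.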
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.0f, +1.0f), std::ref(rng));
+  std::generate(v0.begin(), v0.end(), std::ref(f32rng));
+  std::generate(v1.begin(), v1.end(), std::ref(f32rng));
+  std::generate(v2.begin(), v2.end(), std::ref(f32rng));
+  std::generate(v3.begin(), v3.end(), std::ref(f32rng));
+  std::generate(v4.begin(), v4.end(), std::ref(f32rng));
+  std::generate(v5.begin(), v5.end(), std::ref(f32rng));
+  std::generate(v6.begin(), v6.end(), std::ref(f32rng));
+  std::generate(v7.begin(), v7.end(), std::ref(f32rng));
+  std::generate(v8.begin(), v8.end(), std::ref(f32rng));
+  std::generate(v9.begin(), v9.end(), std::ref(f32rng));
+  std::generate(v10.begin(), v10.end(), std::ref(f32rng));
+  std::generate(v11.begin(), v11.end(), std::ref(f32rng));
+  std::generate(v12.begin(), v12.end(), std::ref(f32rng));
+  std::generate(v13.begin(), v13.end(), std::ref(f32rng));
+  std::generate(v14.begin(), v14.end(), std::ref(f32rng));
+  std::generate(v15.begin(), v15.end(), std::ref(f32rng));
+  std::generate(v16.begin(), v16.end(), std::ref(f32rng));
+  std::generate(v17.begin(), v17.end(), std::ref(f32rng));
+  std::generate(v18.begin(), v18.end(), std::ref(f32rng));
+  std::generate(v19.begin(), v19.end(), std::ref(f32rng));
+  std::generate(v20.begin(), v20.end(), std::ref(f32rng));
+  std::generate(v21.begin(), v21.end(), std::ref(f32rng));
+  std::generate(v22.begin(), v22.end(), std::ref(f32rng));
+  std::generate(v23.begin(), v23.end(), std::ref(f32rng));
+  std::generate(v24.begin(), v24.end(), std::ref(f32rng));
+  std::generate(v25.begin(), v25.end(), std::ref(f32rng));
+  std::generate(v26.begin(), v26.end(), std::ref(f32rng));
+  std::generate(v27.begin(), v27.end(), std::ref(f32rng));
+  std::generate(v28.begin(), v28.end(), std::ref(f32rng));
+  std::generate(v29.begin(), v29.end(), std::ref(f32rng));
+  std::generate(v30.begin(), v30.end(), std::ref(f32rng));
+  std::generate(v31.begin(), v31.end(), std::ref(f32rng));
+  std::generate(v32.begin(), v32.end(), std::ref(f32rng));
+  std::generate(v33.begin(), v33.end(), std::ref(f32rng));
+  std::generate(v34.begin(), v34.end(), std::ref(f32rng));
+  std::generate(v35.begin(), v35.end(), std::ref(f32rng));
+  std::generate(v36.begin(), v36.end(), std::ref(f32rng));
+  std::generate(v37.begin(), v37.end(), std::ref(f32rng));
+  std::generate(v38.begin(), v38.end(), std::ref(f32rng));
+  std::generate(v39.begin(), v39.end(), std::ref(f32rng));
+  std::generate(v40.begin(), v40.end(), std::ref(f32rng));
+  std::generate(v41.begin(), v41.end(), std::ref(f32rng));
+  std::generate(v42.begin(), v42.end(), std::ref(f32rng));
+  std::generate(v43.begin(), v43.end(), std::ref(f32rng));
+  std::generate(v44.begin(), v44.end(), std::ref(f32rng));
+  std::generate(v45.begin(), v45.end(), std::ref(f32rng));
+  std::generate(v46.begin(), v46.end(), std::ref(f32rng));
+  std::generate(v47.begin(), v47.end(), std::ref(f32rng));
+  std::generate(v48.begin(), v48.end(), std::ref(f32rng));
+  std::generate(v49.begin(), v49.end(), std::ref(f32rng));
+  std::generate(v50.begin(), v50.end(), std::ref(f32rng));
+  std::generate(v51.begin(), v51.end(), std::ref(f32rng));
+  std::generate(v52.begin(), v52.end(), std::ref(f32rng));
+  std::generate(v53.begin(), v53.end(), std::ref(f32rng));
+  std::generate(v54.begin(), v54.end(), std::ref(f32rng));
+  std::generate(v55.begin(), v55.end(), std::ref(f32rng));
+  std::generate(v56.begin(), v56.end(), std::ref(f32rng));
+  std::generate(v57.begin(), v57.end(), std::ref(f32rng));
+  std::generate(v58.begin(), v58.end(), std::ref(f32rng));
+  std::generate(v59.begin(), v59.end(), std::ref(f32rng));
+  std::generate(v60.begin(), v60.end(), std::ref(f32rng));
+  std::generate(v61.begin(), v61.end(), std::ref(f32rng));
+  std::generate(v62.begin(), v62.end(), std::ref(f32rng));
+  std::generate(v63.begin(), v63.end(), std::ref(f32rng));
+  std::generate(v64.begin(), v64.end(), std::ref(f32rng));
+  std::generate(v65.begin(), v65.end(), std::ref(f32rng));
+  std::generate(v66.begin(), v66.end(), std::ref(f32rng));
+  std::generate(v67.begin(), v67.end(), std::ref(f32rng));
+  std::generate(v68.begin(), v68.end(), std::ref(f32rng));
+  std::generate(v69.begin(), v69.end(), std::ref(f32rng));
+  std::generate(v70.begin(), v70.end(), std::ref(f32rng));
+  std::generate(v71.begin(), v71.end(), std::ref(f32rng));
+  std::generate(v72.begin(), v72.end(), std::ref(f32rng));
+  std::generate(v73.begin(), v73.end(), std::ref(f32rng));
+  std::generate(v74.begin(), v74.end(), std::ref(f32rng));
+  std::generate(v75.begin(), v75.end(), std::ref(f32rng));
+  std::generate(v76.begin(), v76.end(), std::ref(f32rng));
+  std::generate(v77.begin(), v77.end(), std::ref(f32rng));
+  std::generate(v78.begin(), v78.end(), std::ref(f32rng));
+  std::generate(v79.begin(), v79.end(), std::ref(f32rng));
+  std::generate(v80.begin(), v80.end(), std::ref(f32rng));
+  std::generate(v81.begin(), v81.end(), std::ref(f32rng));
+  std::generate(v82.begin(), v82.end(), std::ref(f32rng));
+  std::generate(v83.begin(), v83.end(), std::ref(f32rng));
+  std::generate(v84.begin(), v84.end(), std::ref(f32rng));
+  std::generate(v85.begin(), v85.end(), std::ref(f32rng));
+  std::generate(v86.begin(), v86.end(), std::ref(f32rng));
+  std::generate(v87.begin(), v87.end(), std::ref(f32rng));
+  std::generate(v88.begin(), v88.end(), std::ref(f32rng));
+  std::generate(v89.begin(), v89.end(), std::ref(f32rng));
+  std::generate(v90.begin(), v90.end(), std::ref(f32rng));
+  std::generate(v91.begin(), v91.end(), std::ref(f32rng));
+  std::generate(v92.begin(), v92.end(), std::ref(f32rng));
+  std::generate(v93.begin(), v93.end(), std::ref(f32rng));
+  std::generate(v94.begin(), v94.end(), std::ref(f32rng));
+  std::generate(v95.begin(), v95.end(), std::ref(f32rng));
+  std::generate(v96.begin(), v96.end(), std::ref(f32rng));
+  std::generate(v97.begin(), v97.end(), std::ref(f32rng));
+  std::generate(v98.begin(), v98.end(), std::ref(f32rng));
+  std::generate(v99.begin(), v99.end(), std::ref(f32rng));
+  std::generate(w100.begin(), w100.end(), std::ref(f32rng));
+  std::generate(w101.begin(), w101.end(), std::ref(f32rng));
+  std::generate(w102.begin(), w102.end(), std::ref(f32rng));
+  std::generate(w103.begin(), w103.end(), std::ref(f32rng));
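+  // Kernels meant to be sparse (evidently the 1x1/pointwise convolutions) are
+  // zero-filled, non-zero values are generated only for the dense fraction
+  // (1 - sparsity, with `sparsity` defined earlier in this function), and the
+  // buffer is then shuffled so the zeros are spread across the kernel.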
+  std::fill(w104.begin(), w104.end(), 0.0f);
+  std::generate(w104.begin(), w104.end() - size_t(sparsity * w104.size()), std::ref(f32rng));
+  std::shuffle(w104.begin(), w104.end(), rng);
+  std::generate(w105.begin(), w105.end(), std::ref(f32rng));
+  std::fill(w106.begin(), w106.end(), 0.0f);
+  std::generate(w106.begin(), w106.end() - size_t(sparsity * w106.size()), std::ref(f32rng));
+  std::shuffle(w106.begin(), w106.end(), rng);
+  std::generate(w107.begin(), w107.end(), std::ref(f32rng));
+  std::fill(w108.begin(), w108.end(), 0.0f);
+  std::generate(w108.begin(), w108.end() - size_t(sparsity * w108.size()), std::ref(f32rng));
+  std::shuffle(w108.begin(), w108.end(), rng);
+  std::generate(w109.begin(), w109.end(), std::ref(f32rng));
+  std::fill(w110.begin(), w110.end(), 0.0f);
+  std::generate(w110.begin(), w110.end() - size_t(sparsity * w110.size()), std::ref(f32rng));
+  std::shuffle(w110.begin(), w110.end(), rng);
+  std::generate(w111.begin(), w111.end(), std::ref(f32rng));
+  std::generate(w112.begin(), w112.end(), std::ref(f32rng));
+  std::generate(w113.begin(), w113.end(), std::ref(f32rng));
+  std::fill(w114.begin(), w114.end(), 0.0f);
+  std::generate(w114.begin(), w114.end() - size_t(sparsity * w114.size()), std::ref(f32rng));
+  std::shuffle(w114.begin(), w114.end(), rng);
+  std::generate(w115.begin(), w115.end(), std::ref(f32rng));
+  std::fill(w116.begin(), w116.end(), 0.0f);
+  std::generate(w116.begin(), w116.end() - size_t(sparsity * w116.size()), std::ref(f32rng));
+  std::shuffle(w116.begin(), w116.end(), rng);
+  std::generate(w117.begin(), w117.end(), std::ref(f32rng));
+  std::generate(w118.begin(), w118.end(), std::ref(f32rng));
+  std::generate(w119.begin(), w119.end(), std::ref(f32rng));
+  std::fill(w120.begin(), w120.end(), 0.0f);
+  std::generate(w120.begin(), w120.end() - size_t(sparsity * w120.size()), std::ref(f32rng));
+  std::shuffle(w120.begin(), w120.end(), rng);
+  std::generate(w121.begin(), w121.end(), std::ref(f32rng));
+  std::fill(w122.begin(), w122.end(), 0.0f);
+  std::generate(w122.begin(), w122.end() - size_t(sparsity * w122.size()), std::ref(f32rng));
+  std::shuffle(w122.begin(), w122.end(), rng);
+  std::generate(w123.begin(), w123.end(), std::ref(f32rng));
+  std::generate(w124.begin(), w124.end(), std::ref(f32rng));
+  std::generate(w125.begin(), w125.end(), std::ref(f32rng));
+  std::fill(w126.begin(), w126.end(), 0.0f);
+  std::generate(w126.begin(), w126.end() - size_t(sparsity * w126.size()), std::ref(f32rng));
+  std::shuffle(w126.begin(), w126.end(), rng);
+  std::generate(w127.begin(), w127.end(), std::ref(f32rng));
+  std::fill(w128.begin(), w128.end(), 0.0f);
+  std::generate(w128.begin(), w128.end() - size_t(sparsity * w128.size()), std::ref(f32rng));
+  std::shuffle(w128.begin(), w128.end(), rng);
+  std::generate(w129.begin(), w129.end(), std::ref(f32rng));
+  std::fill(w130.begin(), w130.end(), 0.0f);
+  std::generate(w130.begin(), w130.end() - size_t(sparsity * w130.size()), std::ref(f32rng));
+  std::shuffle(w130.begin(), w130.end(), rng);
+  std::generate(w131.begin(), w131.end(), std::ref(f32rng));
+  std::fill(w132.begin(), w132.end(), 0.0f);
+  std::generate(w132.begin(), w132.end() - size_t(sparsity * w132.size()), std::ref(f32rng));
+  std::shuffle(w132.begin(), w132.end(), rng);
+  std::generate(w133.begin(), w133.end(), std::ref(f32rng));
+  std::generate(w134.begin(), w134.end(), std::ref(f32rng));
+  std::generate(w135.begin(), w135.end(), std::ref(f32rng));
+  std::fill(w136.begin(), w136.end(), 0.0f);
+  std::generate(w136.begin(), w136.end() - size_t(sparsity * w136.size()), std::ref(f32rng));
+  std::shuffle(w136.begin(), w136.end(), rng);
+  std::generate(w137.begin(), w137.end(), std::ref(f32rng));
+  std::fill(w138.begin(), w138.end(), 0.0f);
+  std::generate(w138.begin(), w138.end() - size_t(sparsity * w138.size()), std::ref(f32rng));
+  std::shuffle(w138.begin(), w138.end(), rng);
+  std::generate(w139.begin(), w139.end(), std::ref(f32rng));
+  std::fill(w140.begin(), w140.end(), 0.0f);
+  std::generate(w140.begin(), w140.end() - size_t(sparsity * w140.size()), std::ref(f32rng));
+  std::shuffle(w140.begin(), w140.end(), rng);
+  std::generate(w141.begin(), w141.end(), std::ref(f32rng));
+  std::fill(w142.begin(), w142.end(), 0.0f);
+  std::generate(w142.begin(), w142.end() - size_t(sparsity * w142.size()), std::ref(f32rng));
+  std::shuffle(w142.begin(), w142.end(), rng);
+  std::generate(w143.begin(), w143.end(), std::ref(f32rng));
+  std::generate(w144.begin(), w144.end(), std::ref(f32rng));
+  std::generate(w145.begin(), w145.end(), std::ref(f32rng));
+  std::fill(w146.begin(), w146.end(), 0.0f);
+  std::generate(w146.begin(), w146.end() - size_t(sparsity * w146.size()), std::ref(f32rng));
+  std::shuffle(w146.begin(), w146.end(), rng);
+  std::generate(w147.begin(), w147.end(), std::ref(f32rng));
+  std::fill(w148.begin(), w148.end(), 0.0f);
+  std::generate(w148.begin(), w148.end() - size_t(sparsity * w148.size()), std::ref(f32rng));
+  std::shuffle(w148.begin(), w148.end(), rng);
+  std::generate(w149.begin(), w149.end(), std::ref(f32rng));
+  std::fill(w150.begin(), w150.end(), 0.0f);
+  std::generate(w150.begin(), w150.end() - size_t(sparsity * w150.size()), std::ref(f32rng));
+  std::shuffle(w150.begin(), w150.end(), rng);
+  std::generate(w151.begin(), w151.end(), std::ref(f32rng));
+  std::fill(w152.begin(), w152.end(), 0.0f);
+  std::generate(w152.begin(), w152.end() - size_t(sparsity * w152.size()), std::ref(f32rng));
+  std::shuffle(w152.begin(), w152.end(), rng);
+  std::generate(w153.begin(), w153.end(), std::ref(f32rng));
+  std::generate(w154.begin(), w154.end(), std::ref(f32rng));
+  std::generate(w155.begin(), w155.end(), std::ref(f32rng));
+  std::fill(w156.begin(), w156.end(), 0.0f);
+  std::generate(w156.begin(), w156.end() - size_t(sparsity * w156.size()), std::ref(f32rng));
+  std::shuffle(w156.begin(), w156.end(), rng);
+  std::generate(w157.begin(), w157.end(), std::ref(f32rng));
+  std::fill(w158.begin(), w158.end(), 0.0f);
+  std::generate(w158.begin(), w158.end() - size_t(sparsity * w158.size()), std::ref(f32rng));
+  std::shuffle(w158.begin(), w158.end(), rng);
+  std::generate(w159.begin(), w159.end(), std::ref(f32rng));
+  std::fill(w160.begin(), w160.end(), 0.0f);
+  std::generate(w160.begin(), w160.end() - size_t(sparsity * w160.size()), std::ref(f32rng));
+  std::shuffle(w160.begin(), w160.end(), rng);
+  std::generate(w161.begin(), w161.end(), std::ref(f32rng));
+  std::fill(w162.begin(), w162.end(), 0.0f);
+  std::generate(w162.begin(), w162.end() - size_t(sparsity * w162.size()), std::ref(f32rng));
+  std::shuffle(w162.begin(), w162.end(), rng);
+  std::generate(w163.begin(), w163.end(), std::ref(f32rng));
+  std::generate(w164.begin(), w164.end(), std::ref(f32rng));
+  std::generate(w165.begin(), w165.end(), std::ref(f32rng));
+  std::fill(w166.begin(), w166.end(), 0.0f);
+  std::generate(w166.begin(), w166.end() - size_t(sparsity * w166.size()), std::ref(f32rng));
+  std::shuffle(w166.begin(), w166.end(), rng);
+  std::generate(w167.begin(), w167.end(), std::ref(f32rng));
+  std::fill(w168.begin(), w168.end(), 0.0f);
+  std::generate(w168.begin(), w168.end() - size_t(sparsity * w168.size()), std::ref(f32rng));
+  std::shuffle(w168.begin(), w168.end(), rng);
+  std::generate(w169.begin(), w169.end(), std::ref(f32rng));
+  std::fill(w170.begin(), w170.end(), 0.0f);
+  std::generate(w170.begin(), w170.end() - size_t(sparsity * w170.size()), std::ref(f32rng));
+  std::shuffle(w170.begin(), w170.end(), rng);
+  std::generate(w171.begin(), w171.end(), std::ref(f32rng));
+  std::fill(w172.begin(), w172.end(), 0.0f);
+  std::generate(w172.begin(), w172.end() - size_t(sparsity * w172.size()), std::ref(f32rng));
+  std::shuffle(w172.begin(), w172.end(), rng);
+  std::generate(w173.begin(), w173.end(), std::ref(f32rng));
+  std::generate(w174.begin(), w174.end(), std::ref(f32rng));
+  std::generate(w175.begin(), w175.end(), std::ref(f32rng));
+  std::fill(w176.begin(), w176.end(), 0.0f);
+  std::generate(w176.begin(), w176.end() - size_t(sparsity * w176.size()), std::ref(f32rng));
+  std::shuffle(w176.begin(), w176.end(), rng);
+  std::generate(w177.begin(), w177.end(), std::ref(f32rng));
+  std::fill(w178.begin(), w178.end(), 0.0f);
+  std::generate(w178.begin(), w178.end() - size_t(sparsity * w178.size()), std::ref(f32rng));
+  std::shuffle(w178.begin(), w178.end(), rng);
+  std::generate(w179.begin(), w179.end(), std::ref(f32rng));
+  std::fill(w180.begin(), w180.end(), 0.0f);
+  std::generate(w180.begin(), w180.end() - size_t(sparsity * w180.size()), std::ref(f32rng));
+  std::shuffle(w180.begin(), w180.end(), rng);
+  std::generate(w181.begin(), w181.end(), std::ref(f32rng));
+  std::fill(w182.begin(), w182.end(), 0.0f);
+  std::generate(w182.begin(), w182.end() - size_t(sparsity * w182.size()), std::ref(f32rng));
+  std::shuffle(w182.begin(), w182.end(), rng);
+  std::generate(w183.begin(), w183.end(), std::ref(f32rng));
+  std::generate(w184.begin(), w184.end(), std::ref(f32rng));
+  std::generate(w185.begin(), w185.end(), std::ref(f32rng));
+  std::fill(w186.begin(), w186.end(), 0.0f);
+  std::generate(w186.begin(), w186.end() - size_t(sparsity * w186.size()), std::ref(f32rng));
+  std::shuffle(w186.begin(), w186.end(), rng);
+  std::generate(w187.begin(), w187.end(), std::ref(f32rng));
+  std::fill(w188.begin(), w188.end(), 0.0f);
+  std::generate(w188.begin(), w188.end() - size_t(sparsity * w188.size()), std::ref(f32rng));
+  std::shuffle(w188.begin(), w188.end(), rng);
+  std::generate(w189.begin(), w189.end(), std::ref(f32rng));
+  std::fill(w190.begin(), w190.end(), 0.0f);
+  std::generate(w190.begin(), w190.end() - size_t(sparsity * w190.size()), std::ref(f32rng));
+  std::shuffle(w190.begin(), w190.end(), rng);
+  std::generate(w191.begin(), w191.end(), std::ref(f32rng));
+  std::fill(w192.begin(), w192.end(), 0.0f);
+  std::generate(w192.begin(), w192.end() - size_t(sparsity * w192.size()), std::ref(f32rng));
+  std::shuffle(w192.begin(), w192.end(), rng);
+  std::generate(w193.begin(), w193.end(), std::ref(f32rng));
+  std::generate(w194.begin(), w194.end(), std::ref(f32rng));
+  std::generate(w195.begin(), w195.end(), std::ref(f32rng));
+  std::fill(w196.begin(), w196.end(), 0.0f);
+  std::generate(w196.begin(), w196.end() - size_t(sparsity * w196.size()), std::ref(f32rng));
+  std::shuffle(w196.begin(), w196.end(), rng);
+  std::generate(w197.begin(), w197.end(), std::ref(f32rng));
+  std::fill(w198.begin(), w198.end(), 0.0f);
+  std::generate(w198.begin(), w198.end() - size_t(sparsity * w198.size()), std::ref(f32rng));
+  std::shuffle(w198.begin(), w198.end(), rng);
+  std::generate(w199.begin(), w199.end(), std::ref(f32rng));
+  std::fill(w200.begin(), w200.end(), 0.0f);
+  std::generate(w200.begin(), w200.end() - size_t(sparsity * w200.size()), std::ref(f32rng));
+  std::shuffle(w200.begin(), w200.end(), rng);
+  std::generate(w201.begin(), w201.end(), std::ref(f32rng));
+  std::fill(w202.begin(), w202.end(), 0.0f);
+  std::generate(w202.begin(), w202.end() - size_t(sparsity * w202.size()), std::ref(f32rng));
+  std::shuffle(w202.begin(), w202.end(), rng);
+  std::generate(w203.begin(), w203.end(), std::ref(f32rng));
+  std::fill(w204.begin(), w204.end(), 0.0f);
+  std::generate(w204.begin(), w204.end() - size_t(sparsity * w204.size()), std::ref(f32rng));
+  std::shuffle(w204.begin(), w204.end(), rng);
+  std::generate(w205.begin(), w205.end(), std::ref(f32rng));
+  std::generate(w206.begin(), w206.end(), std::ref(f32rng));
+  std::generate(w207.begin(), w207.end(), std::ref(f32rng));
+
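+  // Build the operator graph. Each xnn_create_* call constructs one operator;
+  // on success it is appended to the ExecutionPlan, which owns it via the
+  // xnn_delete_operator deleter, and on any failure an empty plan is returned.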
+  ExecutionPlan operators;
+  xnn_status status;
+
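+  // Operator #0: the stem 3x3, stride-2 convolution. XNN_FLAG_INPUT_NHWC
+  // indicates that this first operator reads its input in NHWC layout even
+  // though it runs on the NCHW (CHW-sparse) path like the rest of the model.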
+  xnn_operator_t op0 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    1 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 1 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    2 /* subsampling height */, 2 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    3 /* input channels per group */,
+    16 /* output_channels_per_group */,
+    3 /* input pixel stride */,
+    16 /* output pixel stride */,
+    w100.data(), w101.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    XNN_FLAG_INPUT_NHWC /* flags */,
+    &op0);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #0" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op0, xnn_delete_operator);
+
+  xnn_operator_t op1 = nullptr;
+  status = xnn_create_hardswish_nc_f32(
+    16 /* channels */,
+    16 /* input stride */,
+    16 /* output stride */,
+    0 /* flags */,
+    &op1);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #1" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op1, xnn_delete_operator);
+
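+  // Operator #2: a 3x3, stride-2 depthwise convolution (groups == channels,
+  // one input and one output channel per group); the 0.0f output min acts as
+  // a ReLU clamp.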
+  xnn_operator_t op2 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    1 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 1 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    2 /* subsampling height */, 2 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    16 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    16 /* input pixel stride */,
+    16 /* output pixel stride */,
+    w102.data(), w103.data(),
+    0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op2);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #2" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op2, xnn_delete_operator);
+
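+  // Operators #3-#6 appear to form a squeeze-and-excitation block: global
+  // average pooling over the spatial dimensions, a 1x1 reduction convolution
+  // with ReLU, a 1x1 expansion convolution clamped to [0, 0x1.00014Fp+0]
+  // (just above 1.0, likely approximating the hard-sigmoid gate), and a
+  // broadcast multiply that rescales the feature map. The same pattern
+  // repeats for the later squeeze-and-excitation stages below.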
+  xnn_operator_t op3 = nullptr;
+  status = xnn_create_global_average_pooling_ncw_f32(
+    16 /* channels */,
+    -std::numeric_limits<float>::infinity(), std::numeric_limits<float>::infinity(),
+    0 /* flags */,
+    &op3);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #3" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op3, xnn_delete_operator);
+
+  xnn_operator_t op4 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    16 /* input channels per group */,
+    8 /* output_channels_per_group */,
+    16 /* input pixel stride */,
+    8 /* output pixel stride */,
+    w104.data(), w105.data(),
+    0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op4);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #4" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op4, xnn_delete_operator);
+
+  xnn_operator_t op5 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    8 /* input channels per group */,
+    16 /* output_channels_per_group */,
+    8 /* input pixel stride */,
+    16 /* output pixel stride */,
+    w106.data(), w107.data(),
+    0.0f /* output min */, +0x1.00014Fp+0 /* output max */,
+    0 /* flags */,
+    &op5);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #5" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op5, xnn_delete_operator);
+
+  xnn_operator_t op6 = nullptr;
+  status = xnn_create_multiply_nd_f32(
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op6);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #6" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op6, xnn_delete_operator);
+
+  xnn_operator_t op7 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    16 /* input channels per group */,
+    16 /* output_channels_per_group */,
+    16 /* input pixel stride */,
+    16 /* output pixel stride */,
+    w108.data(), w109.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op7);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #7" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op7, xnn_delete_operator);
+
+  xnn_operator_t op8 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    16 /* input channels per group */,
+    72 /* output_channels_per_group */,
+    16 /* input pixel stride */,
+    72 /* output pixel stride */,
+    w110.data(), w111.data(),
+    0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op8);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #8" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op8, xnn_delete_operator);
+
+  xnn_operator_t op9 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    1 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 1 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    2 /* subsampling height */, 2 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    72 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    72 /* input pixel stride */,
+    72 /* output pixel stride */,
+    w112.data(), w113.data(),
+    0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op9);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #9" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op9, xnn_delete_operator);
+
+  xnn_operator_t op10 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    72 /* input channels per group */,
+    24 /* output_channels_per_group */,
+    72 /* input pixel stride */,
+    24 /* output pixel stride */,
+    w114.data(), w115.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op10);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #10" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op10, xnn_delete_operator);
+
+  xnn_operator_t op11 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    24 /* input channels per group */,
+    88 /* output_channels_per_group */,
+    24 /* input pixel stride */,
+    88 /* output pixel stride */,
+    w116.data(), w117.data(),
+    0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op11);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #11" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op11, xnn_delete_operator);
+
+  xnn_operator_t op12 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    1 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 1 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    88 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    88 /* input pixel stride */,
+    88 /* output pixel stride */,
+    w118.data(), w119.data(),
+    0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op12);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #12" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op12, xnn_delete_operator);
+
+  xnn_operator_t op13 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    88 /* input channels per group */,
+    24 /* output_channels_per_group */,
+    88 /* input pixel stride */,
+    24 /* output pixel stride */,
+    w120.data(), w121.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op13);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #13" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op13, xnn_delete_operator);
+
+  xnn_operator_t op14 = nullptr;
+  status = xnn_create_add_nd_f32(
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op14);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #14" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op14, xnn_delete_operator);
+
+  xnn_operator_t op15 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    24 /* input channels per group */,
+    96 /* output_channels_per_group */,
+    24 /* input pixel stride */,
+    96 /* output pixel stride */,
+    w122.data(), w123.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op15);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #15" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op15, xnn_delete_operator);
+
+  xnn_operator_t op16 = nullptr;
+  status = xnn_create_hardswish_nc_f32(
+    96 /* channels */,
+    96 /* input stride */,
+    96 /* output stride */,
+    0 /* flags */,
+    &op16);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #16" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op16, xnn_delete_operator);
+
+  xnn_operator_t op17 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    2 /* top padding */, 2 /* right padding */,
+    2 /* bottom padding */, 2 /* left padding */,
+    5 /* kernel height */, 5 /* kernel width */,
+    2 /* subsampling height */, 2 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    96 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    96 /* input pixel stride */,
+    96 /* output pixel stride */,
+    w124.data(), w125.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op17);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #17" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op17, xnn_delete_operator);
+
+  xnn_operator_t op18 = nullptr;
+  status = xnn_create_hardswish_nc_f32(
+    96 /* channels */,
+    96 /* input stride */,
+    96 /* output stride */,
+    0 /* flags */,
+    &op18);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #18" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op18, xnn_delete_operator);
+
+  xnn_operator_t op19 = nullptr;
+  status = xnn_create_global_average_pooling_ncw_f32(
+    96 /* channels */,
+    -std::numeric_limits<float>::infinity(), std::numeric_limits<float>::infinity(),
+    0 /* flags */,
+    &op19);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #19" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op19, xnn_delete_operator);
+
+  xnn_operator_t op20 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    96 /* input channels per group */,
+    24 /* output_channels_per_group */,
+    96 /* input pixel stride */,
+    24 /* output pixel stride */,
+    w126.data(), w127.data(),
+    0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op20);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #20" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op20, xnn_delete_operator);
+
+  xnn_operator_t op21 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    24 /* input channels per group */,
+    96 /* output_channels_per_group */,
+    24 /* input pixel stride */,
+    96 /* output pixel stride */,
+    w128.data(), w129.data(),
+    0.0f /* output min */, +0x1.00014Fp+0 /* output max */,
+    0 /* flags */,
+    &op21);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #21" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op21, xnn_delete_operator);
+
+  xnn_operator_t op22 = nullptr;
+  status = xnn_create_multiply_nd_f32(
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op22);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #22" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op22, xnn_delete_operator);
+
+  xnn_operator_t op23 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    96 /* input channels per group */,
+    40 /* output_channels_per_group */,
+    96 /* input pixel stride */,
+    40 /* output pixel stride */,
+    w130.data(), w131.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op23);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #23" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op23, xnn_delete_operator);
+
+  xnn_operator_t op24 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    40 /* input channels per group */,
+    240 /* output_channels_per_group */,
+    40 /* input pixel stride */,
+    240 /* output pixel stride */,
+    w132.data(), w133.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op24);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #24" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op24, xnn_delete_operator);
+
+  xnn_operator_t op25 = nullptr;
+  status = xnn_create_hardswish_nc_f32(
+    240 /* channels */,
+    240 /* input stride */,
+    240 /* output stride */,
+    0 /* flags */,
+    &op25);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #25" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op25, xnn_delete_operator);
+
+  xnn_operator_t op26 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    2 /* top padding */, 2 /* right padding */,
+    2 /* bottom padding */, 2 /* left padding */,
+    5 /* kernel height */, 5 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    240 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    240 /* input pixel stride */,
+    240 /* output pixel stride */,
+    w134.data(), w135.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op26);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #26" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op26, xnn_delete_operator);
+
+  xnn_operator_t op27 = nullptr;
+  status = xnn_create_hardswish_nc_f32(
+    240 /* channels */,
+    240 /* input stride */,
+    240 /* output stride */,
+    0 /* flags */,
+    &op27);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #27" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op27, xnn_delete_operator);
+
+  xnn_operator_t op28 = nullptr;
+  status = xnn_create_global_average_pooling_ncw_f32(
+    240 /* channels */,
+    -std::numeric_limits<float>::infinity(), std::numeric_limits<float>::infinity(),
+    0 /* flags */,
+    &op28);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #28" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op28, xnn_delete_operator);
+
+  xnn_operator_t op29 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    240 /* input channels per group */,
+    64 /* output_channels_per_group */,
+    240 /* input pixel stride */,
+    64 /* output pixel stride */,
+    w136.data(), w137.data(),
+    0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op29);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #29" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op29, xnn_delete_operator);
+
+  xnn_operator_t op30 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    64 /* input channels per group */,
+    240 /* output_channels_per_group */,
+    64 /* input pixel stride */,
+    240 /* output pixel stride */,
+    w138.data(), w139.data(),
+    0.0f /* output min */, +0x1.00014Fp+0 /* output max */,
+    0 /* flags */,
+    &op30);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #30" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op30, xnn_delete_operator);
+
+  xnn_operator_t op31 = nullptr;
+  status = xnn_create_multiply_nd_f32(
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op31);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #31" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op31, xnn_delete_operator);
+
+  xnn_operator_t op32 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    240 /* input channels per group */,
+    40 /* output_channels_per_group */,
+    240 /* input pixel stride */,
+    40 /* output pixel stride */,
+    w140.data(), w141.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op32);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #32" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op32, xnn_delete_operator);
+
+  xnn_operator_t op33 = nullptr;
+  status = xnn_create_add_nd_f32(
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op33);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #33" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op33, xnn_delete_operator);
+
+  xnn_operator_t op34 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    40 /* input channels per group */,
+    240 /* output_channels_per_group */,
+    40 /* input pixel stride */,
+    240 /* output pixel stride */,
+    w142.data(), w143.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op34);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #34" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op34, xnn_delete_operator);
+
+  xnn_operator_t op35 = nullptr;
+  status = xnn_create_hardswish_nc_f32(
+    240 /* channels */,
+    240 /* input stride */,
+    240 /* output stride */,
+    0 /* flags */,
+    &op35);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #35" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op35, xnn_delete_operator);
+
+  xnn_operator_t op36 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    2 /* top padding */, 2 /* right padding */,
+    2 /* bottom padding */, 2 /* left padding */,
+    5 /* kernel height */, 5 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    240 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    240 /* input pixel stride */,
+    240 /* output pixel stride */,
+    w144.data(), w145.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op36);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #36" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op36, xnn_delete_operator);
+
+  xnn_operator_t op37 = nullptr;
+  status = xnn_create_hardswish_nc_f32(
+    240 /* channels */,
+    240 /* input stride */,
+    240 /* output stride */,
+    0 /* flags */,
+    &op37);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #37" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op37, xnn_delete_operator);
+
+  xnn_operator_t op38 = nullptr;
+  status = xnn_create_global_average_pooling_ncw_f32(
+    240 /* channels */,
+    -std::numeric_limits<float>::infinity(), std::numeric_limits<float>::infinity(),
+    0 /* flags */,
+    &op38);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #38" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op38, xnn_delete_operator);
+
+  xnn_operator_t op39 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    240 /* input channels per group */,
+    64 /* output_channels_per_group */,
+    240 /* input pixel stride */,
+    64 /* output pixel stride */,
+    w146.data(), w147.data(),
+    0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op39);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #39" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op39, xnn_delete_operator);
+
+  xnn_operator_t op40 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    64 /* input channels per group */,
+    240 /* output_channels_per_group */,
+    64 /* input pixel stride */,
+    240 /* output pixel stride */,
+    w148.data(), w149.data(),
+    0.0f /* output min */, +0x1.00014Fp+0 /* output max */,
+    0 /* flags */,
+    &op40);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #40" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op40, xnn_delete_operator);
+
+  xnn_operator_t op41 = nullptr;
+  status = xnn_create_multiply_nd_f32(
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op41);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #41" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op41, xnn_delete_operator);
+
+  xnn_operator_t op42 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    240 /* input channels per group */,
+    40 /* output_channels_per_group */,
+    240 /* input pixel stride */,
+    40 /* output pixel stride */,
+    w150.data(), w151.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op42);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #42" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op42, xnn_delete_operator);
+
+  xnn_operator_t op43 = nullptr;
+  status = xnn_create_add_nd_f32(
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op43);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #43" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op43, xnn_delete_operator);
+
+  xnn_operator_t op44 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    40 /* input channels per group */,
+    120 /* output_channels_per_group */,
+    40 /* input pixel stride */,
+    120 /* output pixel stride */,
+    w152.data(), w153.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op44);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #44" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op44, xnn_delete_operator);
+
+  xnn_operator_t op45 = nullptr;
+  status = xnn_create_hardswish_nc_f32(
+    120 /* channels */,
+    120 /* input stride */,
+    120 /* output stride */,
+    0 /* flags */,
+    &op45);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #45" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op45, xnn_delete_operator);
+
+  xnn_operator_t op46 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    2 /* top padding */, 2 /* right padding */,
+    2 /* bottom padding */, 2 /* left padding */,
+    5 /* kernel height */, 5 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    120 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    120 /* input pixel stride */,
+    120 /* output pixel stride */,
+    w154.data(), w155.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op46);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #46" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op46, xnn_delete_operator);
+
+  xnn_operator_t op47 = nullptr;
+  status = xnn_create_hardswish_nc_f32(
+    120 /* channels */,
+    120 /* input stride */,
+    120 /* output stride */,
+    0 /* flags */,
+    &op47);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #47" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op47, xnn_delete_operator);
+
+  xnn_operator_t op48 = nullptr;
+  status = xnn_create_global_average_pooling_ncw_f32(
+    120 /* channels */,
+    -std::numeric_limits<float>::infinity(), std::numeric_limits<float>::infinity(),
+    0 /* flags */,
+    &op48);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #48" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op48, xnn_delete_operator);
+
+  xnn_operator_t op49 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    120 /* input channels per group */,
+    32 /* output_channels_per_group */,
+    120 /* input pixel stride */,
+    32 /* output pixel stride */,
+    w156.data(), w157.data(),
+    0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op49);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #49" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op49, xnn_delete_operator);
+
+  xnn_operator_t op50 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    32 /* input channels per group */,
+    120 /* output_channels_per_group */,
+    32 /* input pixel stride */,
+    120 /* output pixel stride */,
+    w158.data(), w159.data(),
+    0.0f /* output min */, +0x1.00014Fp+0 /* output max */,
+    0 /* flags */,
+    &op50);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #50" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op50, xnn_delete_operator);
+
+  xnn_operator_t op51 = nullptr;
+  status = xnn_create_multiply_nd_f32(
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op51);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #51" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op51, xnn_delete_operator);
+
+  xnn_operator_t op52 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    120 /* input channels per group */,
+    48 /* output_channels_per_group */,
+    120 /* input pixel stride */,
+    48 /* output pixel stride */,
+    w160.data(), w161.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op52);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #52" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op52, xnn_delete_operator);
+
+  xnn_operator_t op53 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    48 /* input channels per group */,
+    144 /* output_channels_per_group */,
+    48 /* input pixel stride */,
+    144 /* output pixel stride */,
+    w162.data(), w163.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op53);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #53" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op53, xnn_delete_operator);
+
+  xnn_operator_t op54 = nullptr;
+  status = xnn_create_hardswish_nc_f32(
+    144 /* channels */,
+    144 /* input stride */,
+    144 /* output stride */,
+    0 /* flags */,
+    &op54);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #54" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op54, xnn_delete_operator);
+
+  xnn_operator_t op55 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    2 /* top padding */, 2 /* right padding */,
+    2 /* bottom padding */, 2 /* left padding */,
+    5 /* kernel height */, 5 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    144 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    144 /* input pixel stride */,
+    144 /* output pixel stride */,
+    w164.data(), w165.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op55);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #55" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op55, xnn_delete_operator);
+
+  xnn_operator_t op56 = nullptr;
+  status = xnn_create_hardswish_nc_f32(
+    144 /* channels */,
+    144 /* input stride */,
+    144 /* output stride */,
+    0 /* flags */,
+    &op56);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #56" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op56, xnn_delete_operator);
+
+  xnn_operator_t op57 = nullptr;
+  status = xnn_create_global_average_pooling_ncw_f32(
+    144 /* channels */,
+    -std::numeric_limits<float>::infinity(), std::numeric_limits<float>::infinity(),
+    0 /* flags */,
+    &op57);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #57" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op57, xnn_delete_operator);
+
+  xnn_operator_t op58 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    144 /* input channels per group */,
+    40 /* output_channels_per_group */,
+    144 /* input pixel stride */,
+    40 /* output pixel stride */,
+    w166.data(), w167.data(),
+    0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op58);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #58" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op58, xnn_delete_operator);
+
+  xnn_operator_t op59 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    40 /* input channels per group */,
+    144 /* output_channels_per_group */,
+    40 /* input pixel stride */,
+    144 /* output pixel stride */,
+    w168.data(), w169.data(),
+    0.0f /* output min */, +0x1.00014Fp+0 /* output max */,
+    0 /* flags */,
+    &op59);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #59" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op59, xnn_delete_operator);
+
+  xnn_operator_t op60 = nullptr;
+  status = xnn_create_multiply_nd_f32(
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op60);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #60" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op60, xnn_delete_operator);
+
+  xnn_operator_t op61 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    144 /* input channels per group */,
+    48 /* output_channels_per_group */,
+    144 /* input pixel stride */,
+    48 /* output pixel stride */,
+    w170.data(), w171.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op61);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #61" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op61, xnn_delete_operator);
+
+  xnn_operator_t op62 = nullptr;
+  status = xnn_create_add_nd_f32(
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op62);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #62" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op62, xnn_delete_operator);
+
+  xnn_operator_t op63 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    48 /* input channels per group */,
+    288 /* output_channels_per_group */,
+    48 /* input pixel stride */,
+    288 /* output pixel stride */,
+    w172.data(), w173.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op63);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #63" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op63, xnn_delete_operator);
+
+  xnn_operator_t op64 = nullptr;
+  status = xnn_create_hardswish_nc_f32(
+    288 /* channels */,
+    288 /* input stride */,
+    288 /* output stride */,
+    0 /* flags */,
+    &op64);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #64" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op64, xnn_delete_operator);
+
+  xnn_operator_t op65 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    2 /* top padding */, 2 /* right padding */,
+    2 /* bottom padding */, 2 /* left padding */,
+    5 /* kernel height */, 5 /* kernel width */,
+    2 /* subsampling height */, 2 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    288 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    288 /* input pixel stride */,
+    288 /* output pixel stride */,
+    w174.data(), w175.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op65);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #65" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op65, xnn_delete_operator);
+
+  xnn_operator_t op66 = nullptr;
+  status = xnn_create_hardswish_nc_f32(
+    288 /* channels */,
+    288 /* input stride */,
+    288 /* output stride */,
+    0 /* flags */,
+    &op66);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #66" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op66, xnn_delete_operator);
+
+  xnn_operator_t op67 = nullptr;
+  status = xnn_create_global_average_pooling_ncw_f32(
+    288 /* channels */,
+    -std::numeric_limits<float>::infinity(), std::numeric_limits<float>::infinity(),
+    0 /* flags */,
+    &op67);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #67" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op67, xnn_delete_operator);
+
+  xnn_operator_t op68 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    288 /* input channels per group */,
+    72 /* output_channels_per_group */,
+    288 /* input pixel stride */,
+    72 /* output pixel stride */,
+    w176.data(), w177.data(),
+    0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op68);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #68" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op68, xnn_delete_operator);
+
+  xnn_operator_t op69 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    72 /* input channels per group */,
+    288 /* output_channels_per_group */,
+    72 /* input pixel stride */,
+    288 /* output pixel stride */,
+    w178.data(), w179.data(),
+    0.0f /* output min */, +0x1.00014Fp+0 /* output max */,
+    0 /* flags */,
+    &op69);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #69" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op69, xnn_delete_operator);
+
+  xnn_operator_t op70 = nullptr;
+  status = xnn_create_multiply_nd_f32(
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op70);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #70" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op70, xnn_delete_operator);
+
+  xnn_operator_t op71 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    288 /* input channels per group */,
+    96 /* output_channels_per_group */,
+    288 /* input pixel stride */,
+    96 /* output pixel stride */,
+    w180.data(), w181.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op71);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #71" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op71, xnn_delete_operator);
+
+  xnn_operator_t op72 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    96 /* input channels per group */,
+    576 /* output_channels_per_group */,
+    96 /* input pixel stride */,
+    576 /* output pixel stride */,
+    w182.data(), w183.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op72);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #72" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op72, xnn_delete_operator);
+
+  xnn_operator_t op73 = nullptr;
+  status = xnn_create_hardswish_nc_f32(
+    576 /* channels */,
+    576 /* input stride */,
+    576 /* output stride */,
+    0 /* flags */,
+    &op73);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #73" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op73, xnn_delete_operator);
+
+  xnn_operator_t op74 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    2 /* top padding */, 2 /* right padding */,
+    2 /* bottom padding */, 2 /* left padding */,
+    5 /* kernel height */, 5 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    576 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    576 /* input pixel stride */,
+    576 /* output pixel stride */,
+    w184.data(), w185.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op74);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #74" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op74, xnn_delete_operator);
+
+  xnn_operator_t op75 = nullptr;
+  status = xnn_create_hardswish_nc_f32(
+    576 /* channels */,
+    576 /* input stride */,
+    576 /* output stride */,
+    0 /* flags */,
+    &op75);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #75" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op75, xnn_delete_operator);
+
+  xnn_operator_t op76 = nullptr;
+  status = xnn_create_global_average_pooling_ncw_f32(
+    576 /* channels */,
+    -std::numeric_limits<float>::infinity(), std::numeric_limits<float>::infinity(),
+    0 /* flags */,
+    &op76);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #76" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op76, xnn_delete_operator);
+
+  xnn_operator_t op77 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    576 /* input channels per group */,
+    144 /* output_channels_per_group */,
+    576 /* input pixel stride */,
+    144 /* output pixel stride */,
+    w186.data(), w187.data(),
+    0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op77);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #77" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op77, xnn_delete_operator);
+
+  xnn_operator_t op78 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    144 /* input channels per group */,
+    576 /* output_channels_per_group */,
+    144 /* input pixel stride */,
+    576 /* output pixel stride */,
+    w188.data(), w189.data(),
+    0.0f /* output min */, +0x1.00014Fp+0 /* output max */,
+    0 /* flags */,
+    &op78);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #78" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op78, xnn_delete_operator);
+
+  xnn_operator_t op79 = nullptr;
+  status = xnn_create_multiply_nd_f32(
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op79);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #79" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op79, xnn_delete_operator);
+
+  xnn_operator_t op80 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    576 /* input channels per group */,
+    96 /* output_channels_per_group */,
+    576 /* input pixel stride */,
+    96 /* output pixel stride */,
+    w190.data(), w191.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op80);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #80" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op80, xnn_delete_operator);
+
+  xnn_operator_t op81 = nullptr;
+  status = xnn_create_add_nd_f32(
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op81);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #81" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op81, xnn_delete_operator);
+
+  xnn_operator_t op82 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    96 /* input channels per group */,
+    576 /* output_channels_per_group */,
+    96 /* input pixel stride */,
+    576 /* output pixel stride */,
+    w192.data(), w193.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op82);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #82" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op82, xnn_delete_operator);
+
+  xnn_operator_t op83 = nullptr;
+  status = xnn_create_hardswish_nc_f32(
+    576 /* channels */,
+    576 /* input stride */,
+    576 /* output stride */,
+    0 /* flags */,
+    &op83);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #83" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op83, xnn_delete_operator);
+
+  xnn_operator_t op84 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    2 /* top padding */, 2 /* right padding */,
+    2 /* bottom padding */, 2 /* left padding */,
+    5 /* kernel height */, 5 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    576 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    576 /* input pixel stride */,
+    576 /* output pixel stride */,
+    w194.data(), w195.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op84);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #84" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op84, xnn_delete_operator);
+
+  xnn_operator_t op85 = nullptr;
+  status = xnn_create_hardswish_nc_f32(
+    576 /* channels */,
+    576 /* input stride */,
+    576 /* output stride */,
+    0 /* flags */,
+    &op85);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #85" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op85, xnn_delete_operator);
+
+  xnn_operator_t op86 = nullptr;
+  status = xnn_create_global_average_pooling_ncw_f32(
+    576 /* channels */,
+    -std::numeric_limits<float>::infinity(), std::numeric_limits<float>::infinity(),
+    0 /* flags */,
+    &op86);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #86" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op86, xnn_delete_operator);
+
+  xnn_operator_t op87 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    576 /* input channels per group */,
+    144 /* output_channels_per_group */,
+    576 /* input pixel stride */,
+    144 /* output pixel stride */,
+    w196.data(), w197.data(),
+    0.0f /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op87);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #87" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op87, xnn_delete_operator);
+
+  xnn_operator_t op88 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    144 /* input channels per group */,
+    576 /* output_channels_per_group */,
+    144 /* input pixel stride */,
+    576 /* output pixel stride */,
+    w198.data(), w199.data(),
+    0.0f /* output min */, +0x1.00014Fp+0 /* output max */,
+    0 /* flags */,
+    &op88);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #88" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op88, xnn_delete_operator);
+
+  xnn_operator_t op89 = nullptr;
+  status = xnn_create_multiply_nd_f32(
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op89);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #89" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op89, xnn_delete_operator);
+
+  xnn_operator_t op90 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    576 /* input channels per group */,
+    96 /* output_channels_per_group */,
+    576 /* input pixel stride */,
+    96 /* output pixel stride */,
+    w200.data(), w201.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op90);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #90" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op90, xnn_delete_operator);
+
+  xnn_operator_t op91 = nullptr;
+  status = xnn_create_add_nd_f32(
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op91);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #91" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op91, xnn_delete_operator);
+
+  xnn_operator_t op92 = nullptr;
+  status = xnn_create_convolution2d_nchw_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    96 /* input channels per group */,
+    576 /* output_channels_per_group */,
+    96 /* input pixel stride */,
+    576 /* output pixel stride */,
+    w202.data(), w203.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op92);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #92" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op92, xnn_delete_operator);
+
+  xnn_operator_t op93 = nullptr;
+  status = xnn_create_hardswish_nc_f32(
+    576 /* channels */,
+    576 /* input stride */,
+    576 /* output stride */,
+    0 /* flags */,
+    &op93);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #93" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op93, xnn_delete_operator);
+
+  xnn_operator_t op94 = nullptr;
+  status = xnn_create_global_average_pooling_ncw_f32(
+    576 /* channels */,
+    -std::numeric_limits<float>::infinity(), std::numeric_limits<float>::infinity(),
+    0 /* flags */,
+    &op94);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #94" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op94, xnn_delete_operator);
+
+  xnn_operator_t op95 = nullptr;
+  status = xnn_create_convolution2d_nhwc_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    576 /* input channels per group */,
+    1024 /* output_channels_per_group */,
+    576 /* input pixel stride */,
+    1024 /* output pixel stride */,
+    w204.data(), w205.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op95);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #95" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op95, xnn_delete_operator);
+
+  xnn_operator_t op96 = nullptr;
+  status = xnn_create_hardswish_nc_f32(
+    1024 /* channels */,
+    1024 /* input stride */,
+    1024 /* output stride */,
+    0 /* flags */,
+    &op96);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #96" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op96, xnn_delete_operator);
+
+  xnn_operator_t op97 = nullptr;
+  status = xnn_create_global_average_pooling_nwc_f32(
+    1024 /* channels */, 1024 /* input stride */, 1024 /* output stride */,
+    -std::numeric_limits<float>::infinity(), std::numeric_limits<float>::infinity(),
+    0 /* flags */,
+    &op97);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #97" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op97, xnn_delete_operator);
+
+  xnn_operator_t op98 = nullptr;
+  status = xnn_create_convolution2d_nhwc_f32(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    1024 /* input channels per group */,
+    1001 /* output_channels_per_group */,
+    1024 /* input pixel stride */,
+    1001 /* output pixel stride */,
+    w206.data(), w207.data(),
+    -std::numeric_limits<float>::infinity() /* output min */, std::numeric_limits<float>::infinity() /* output max */,
+    0 /* flags */,
+    &op98);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #98" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op98, xnn_delete_operator);
+
+
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op0,
+    1 /* batch size */, 224 /* input height */, 224 /* input width */,
+    v0.data() /* input */, v1.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #0" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_hardswish_nc_f32(
+    op1,
+    12544 /* batch size */,
+    v1.data() /* input */, v2.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #1" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op2,
+    1 /* batch size */, 112 /* input height */, 112 /* input width */,
+    v2.data() /* input */, v3.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #2" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_global_average_pooling_ncw_f32(
+    op3,
+    1 /* batch size */, 3136 /* width */,
+    v3.data() /* input */, v4.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #3" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op4,
+    1 /* batch size */, 1 /* input height */, 1 /* input width */,
+    v4.data() /* input */, v5.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #4" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op5,
+    1 /* batch size */, 1 /* input height */, 1 /* input width */,
+    v5.data() /* input */, v6.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #5" << std::endl;
+    return ExecutionPlan();
+  }
+
+  {
+    const size_t a_shape[] = { 1, 16, 56, 56 };
+    const size_t b_shape[] = { 1, 16, 1, 1 };
+    status = xnn_setup_multiply_nd_f32(
+      op6,
+      4, a_shape, 4, b_shape,
+      v3.data() /* a */, v6.data() /* b */, v7.data() /* output */,
+      threadpool /* threadpool */);
+  }
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #6" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op7,
+    1 /* batch size */, 56 /* input height */, 56 /* input width */,
+    v7.data() /* input */, v8.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #7" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op8,
+    1 /* batch size */, 56 /* input height */, 56 /* input width */,
+    v8.data() /* input */, v9.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #8" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op9,
+    1 /* batch size */, 56 /* input height */, 56 /* input width */,
+    v9.data() /* input */, v10.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #9" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op10,
+    1 /* batch size */, 28 /* input height */, 28 /* input width */,
+    v10.data() /* input */, v11.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #10" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op11,
+    1 /* batch size */, 28 /* input height */, 28 /* input width */,
+    v11.data() /* input */, v12.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #11" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op12,
+    1 /* batch size */, 28 /* input height */, 28 /* input width */,
+    v12.data() /* input */, v13.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #12" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op13,
+    1 /* batch size */, 28 /* input height */, 28 /* input width */,
+    v13.data() /* input */, v14.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #13" << std::endl;
+    return ExecutionPlan();
+  }
+
+  {
+    const size_t a_shape[] = { 1, 24, 28, 28 };
+    const size_t b_shape[] = { 1, 24, 28, 28 };
+    status = xnn_setup_add_nd_f32(
+      op14,
+      4, a_shape, 4, b_shape,
+      v14.data() /* a */, v11.data() /* b */, v15.data() /* output */,
+      threadpool /* threadpool */);
+  }
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #14" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op15,
+    1 /* batch size */, 28 /* input height */, 28 /* input width */,
+    v15.data() /* input */, v16.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #15" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_hardswish_nc_f32(
+    op16,
+    784 /* batch size */,
+    v16.data() /* input */, v17.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #16" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op17,
+    1 /* batch size */, 28 /* input height */, 28 /* input width */,
+    v17.data() /* input */, v18.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #17" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_hardswish_nc_f32(
+    op18,
+    196 /* batch size */,
+    v18.data() /* input */, v19.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #18" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_global_average_pooling_ncw_f32(
+    op19,
+    1 /* batch size */, 196 /* width */,
+    v19.data() /* input */, v20.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #19" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op20,
+    1 /* batch size */, 1 /* input height */, 1 /* input width */,
+    v20.data() /* input */, v21.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #20" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op21,
+    1 /* batch size */, 1 /* input height */, 1 /* input width */,
+    v21.data() /* input */, v22.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #21" << std::endl;
+    return ExecutionPlan();
+  }
+
+  {
+    const size_t a_shape[] = { 1, 96, 14, 14 };
+    const size_t b_shape[] = { 1, 96, 1, 1 };
+    status = xnn_setup_multiply_nd_f32(
+      op22,
+      4, a_shape, 4, b_shape,
+      v19.data() /* a */, v22.data() /* b */, v23.data() /* output */,
+      threadpool /* threadpool */);
+  }
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #22" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op23,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v23.data() /* input */, v24.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #23" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op24,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v24.data() /* input */, v25.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #24" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_hardswish_nc_f32(
+    op25,
+    196 /* batch size */,
+    v25.data() /* input */, v26.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #25" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op26,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v26.data() /* input */, v27.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #26" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_hardswish_nc_f32(
+    op27,
+    196 /* batch size */,
+    v27.data() /* input */, v28.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #27" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_global_average_pooling_ncw_f32(
+    op28,
+    1 /* batch size */, 196 /* width */,
+    v28.data() /* input */, v29.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #28" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op29,
+    1 /* batch size */, 1 /* input height */, 1 /* input width */,
+    v29.data() /* input */, v30.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #29" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op30,
+    1 /* batch size */, 1 /* input height */, 1 /* input width */,
+    v30.data() /* input */, v31.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #30" << std::endl;
+    return ExecutionPlan();
+  }
+
+  {
+    const size_t a_shape[] = { 1, 240, 14, 14 };
+    const size_t b_shape[] = { 1, 240, 1, 1 };
+    status = xnn_setup_multiply_nd_f32(
+      op31,
+      4, a_shape, 4, b_shape,
+      v28.data() /* a */, v31.data() /* b */, v32.data() /* output */,
+      threadpool /* threadpool */);
+  }
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #31" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op32,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v32.data() /* input */, v33.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #32" << std::endl;
+    return ExecutionPlan();
+  }
+
+  {
+    const size_t a_shape[] = { 1, 40, 14, 14 };
+    const size_t b_shape[] = { 1, 40, 14, 14 };
+    status = xnn_setup_add_nd_f32(
+      op33,
+      4, a_shape, 4, b_shape,
+      v33.data() /* a */, v24.data() /* b */, v34.data() /* output */,
+      threadpool /* threadpool */);
+  }
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #33" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op34,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v34.data() /* input */, v35.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #34" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_hardswish_nc_f32(
+    op35,
+    196 /* batch size */,
+    v35.data() /* input */, v36.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #35" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op36,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v36.data() /* input */, v37.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #36" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_hardswish_nc_f32(
+    op37,
+    196 /* batch size */,
+    v37.data() /* input */, v38.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #37" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_global_average_pooling_ncw_f32(
+    op38,
+    1 /* batch size */, 196 /* width */,
+    v38.data() /* input */, v39.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #38" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op39,
+    1 /* batch size */, 1 /* input height */, 1 /* input width */,
+    v39.data() /* input */, v40.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #39" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op40,
+    1 /* batch size */, 1 /* input height */, 1 /* input width */,
+    v40.data() /* input */, v41.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #40" << std::endl;
+    return ExecutionPlan();
+  }
+
+  {
+    const size_t a_shape[] = { 1, 240, 14, 14 };
+    const size_t b_shape[] = { 1, 240, 1, 1 };
+    status = xnn_setup_multiply_nd_f32(
+      op41,
+      4, a_shape, 4, b_shape,
+      v38.data() /* a */, v41.data() /* b */, v42.data() /* output */,
+      threadpool /* threadpool */);
+  }
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #41" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op42,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v42.data() /* input */, v43.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #42" << std::endl;
+    return ExecutionPlan();
+  }
+
+  {
+    const size_t a_shape[] = { 1, 40, 14, 14 };
+    const size_t b_shape[] = { 1, 40, 14, 14 };
+    status = xnn_setup_add_nd_f32(
+      op43,
+      4, a_shape, 4, b_shape,
+      v43.data() /* a */, v34.data() /* b */, v44.data() /* output */,
+      threadpool /* threadpool */);
+  }
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #43" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op44,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v44.data() /* input */, v45.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #44" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_hardswish_nc_f32(
+    op45,
+    196 /* batch size */,
+    v45.data() /* input */, v46.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #45" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op46,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v46.data() /* input */, v47.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #46" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_hardswish_nc_f32(
+    op47,
+    196 /* batch size */,
+    v47.data() /* input */, v48.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #47" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_global_average_pooling_ncw_f32(
+    op48,
+    1 /* batch size */, 196 /* width */,
+    v48.data() /* input */, v49.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #48" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op49,
+    1 /* batch size */, 1 /* input height */, 1 /* input width */,
+    v49.data() /* input */, v50.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #49" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op50,
+    1 /* batch size */, 1 /* input height */, 1 /* input width */,
+    v50.data() /* input */, v51.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #50" << std::endl;
+    return ExecutionPlan();
+  }
+
+  {
+    const size_t a_shape[] = { 1, 120, 14, 14 };
+    const size_t b_shape[] = { 1, 120, 1, 1 };
+    status = xnn_setup_multiply_nd_f32(
+      op51,
+      4, a_shape, 4, b_shape,
+      v48.data() /* a */, v51.data() /* b */, v52.data() /* output */,
+      threadpool /* threadpool */);
+  }
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #51" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op52,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v52.data() /* input */, v53.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #52" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op53,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v53.data() /* input */, v54.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #53" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_hardswish_nc_f32(
+    op54,
+    196 /* batch size */,
+    v54.data() /* input */, v55.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #54" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op55,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v55.data() /* input */, v56.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #55" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_hardswish_nc_f32(
+    op56,
+    196 /* batch size */,
+    v56.data() /* input */, v57.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #56" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_global_average_pooling_ncw_f32(
+    op57,
+    1 /* batch size */, 196 /* width */,
+    v57.data() /* input */, v58.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #57" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op58,
+    1 /* batch size */, 1 /* input height */, 1 /* input width */,
+    v58.data() /* input */, v59.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #58" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op59,
+    1 /* batch size */, 1 /* input height */, 1 /* input width */,
+    v59.data() /* input */, v60.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #59" << std::endl;
+    return ExecutionPlan();
+  }
+
+  {
+    const size_t a_shape[] = { 1, 144, 14, 14 };
+    const size_t b_shape[] = { 1, 144, 1, 1 };
+    status = xnn_setup_multiply_nd_f32(
+      op60,
+      4, a_shape, 4, b_shape,
+      v57.data() /* a */, v60.data() /* b */, v61.data() /* output */,
+      threadpool /* threadpool */);
+  }
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #60" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op61,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v61.data() /* input */, v62.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #61" << std::endl;
+    return ExecutionPlan();
+  }
+
+  {
+    const size_t a_shape[] = { 1, 48, 14, 14 };
+    const size_t b_shape[] = { 1, 48, 14, 14 };
+    status = xnn_setup_add_nd_f32(
+      op62,
+      4, a_shape, 4, b_shape,
+      v62.data() /* a */, v53.data() /* b */, v63.data() /* output */,
+      threadpool /* threadpool */);
+  }
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #62" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op63,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v63.data() /* input */, v64.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #63" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_hardswish_nc_f32(
+    op64,
+    196 /* batch size */,
+    v64.data() /* input */, v65.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #64" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op65,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v65.data() /* input */, v66.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #65" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_hardswish_nc_f32(
+    op66,
+    49 /* batch size */,
+    v66.data() /* input */, v67.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #66" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_global_average_pooling_ncw_f32(
+    op67,
+    1 /* batch size */, 49 /* width */,
+    v67.data() /* input */, v68.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #67" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op68,
+    1 /* batch size */, 1 /* input height */, 1 /* input width */,
+    v68.data() /* input */, v69.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #68" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op69,
+    1 /* batch size */, 1 /* input height */, 1 /* input width */,
+    v69.data() /* input */, v70.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #69" << std::endl;
+    return ExecutionPlan();
+  }
+
+  {
+    const size_t a_shape[] = { 1, 288, 7, 7 };
+    const size_t b_shape[] = { 1, 288, 1, 1 };
+    status = xnn_setup_multiply_nd_f32(
+      op70,
+      4, a_shape, 4, b_shape,
+      v67.data() /* a */, v70.data() /* b */, v71.data() /* output */,
+      threadpool /* threadpool */);
+  }
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #70" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op71,
+    1 /* batch size */, 7 /* input height */, 7 /* input width */,
+    v71.data() /* input */, v72.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #71" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op72,
+    1 /* batch size */, 7 /* input height */, 7 /* input width */,
+    v72.data() /* input */, v73.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #72" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_hardswish_nc_f32(
+    op73,
+    49 /* batch size */,
+    v73.data() /* input */, v74.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #73" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op74,
+    1 /* batch size */, 7 /* input height */, 7 /* input width */,
+    v74.data() /* input */, v75.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #74" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_hardswish_nc_f32(
+    op75,
+    49 /* batch size */,
+    v75.data() /* input */, v76.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #75" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_global_average_pooling_ncw_f32(
+    op76,
+    1 /* batch size */, 49 /* width */,
+    v76.data() /* input */, v77.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #76" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op77,
+    1 /* batch size */, 1 /* input height */, 1 /* input width */,
+    v77.data() /* input */, v78.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #77" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op78,
+    1 /* batch size */, 1 /* input height */, 1 /* input width */,
+    v78.data() /* input */, v79.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #78" << std::endl;
+    return ExecutionPlan();
+  }
+
+  {
+    const size_t a_shape[] = { 1, 576, 7, 7 };
+    const size_t b_shape[] = { 1, 576, 1, 1 };
+    status = xnn_setup_multiply_nd_f32(
+      op79,
+      4, a_shape, 4, b_shape,
+      v76.data() /* a */, v79.data() /* b */, v80.data() /* output */,
+      threadpool /* threadpool */);
+  }
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #79" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op80,
+    1 /* batch size */, 7 /* input height */, 7 /* input width */,
+    v80.data() /* input */, v81.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #80" << std::endl;
+    return ExecutionPlan();
+  }
+
+  {
+    const size_t a_shape[] = { 1, 96, 7, 7 };
+    const size_t b_shape[] = { 1, 96, 7, 7 };
+    status = xnn_setup_add_nd_f32(
+      op81,
+      4, a_shape, 4, b_shape,
+      v81.data() /* a */, v72.data() /* b */, v82.data() /* output */,
+      threadpool /* threadpool */);
+  }
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #81" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op82,
+    1 /* batch size */, 7 /* input height */, 7 /* input width */,
+    v82.data() /* input */, v83.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #82" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_hardswish_nc_f32(
+    op83,
+    49 /* batch size */,
+    v83.data() /* input */, v84.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #83" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op84,
+    1 /* batch size */, 7 /* input height */, 7 /* input width */,
+    v84.data() /* input */, v85.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #84" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_hardswish_nc_f32(
+    op85,
+    49 /* batch size */,
+    v85.data() /* input */, v86.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #85" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_global_average_pooling_ncw_f32(
+    op86,
+    1 /* batch size */, 49 /* width */,
+    v86.data() /* input */, v87.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #86" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op87,
+    1 /* batch size */, 1 /* input height */, 1 /* input width */,
+    v87.data() /* input */, v88.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #87" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op88,
+    1 /* batch size */, 1 /* input height */, 1 /* input width */,
+    v88.data() /* input */, v89.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #88" << std::endl;
+    return ExecutionPlan();
+  }
+
+  {
+    const size_t a_shape[] = { 1, 576, 7, 7 };
+    const size_t b_shape[] = { 1, 576, 1, 1 };
+    status = xnn_setup_multiply_nd_f32(
+      op89,
+      4, a_shape, 4, b_shape,
+      v86.data() /* a */, v89.data() /* b */, v90.data() /* output */,
+      threadpool /* threadpool */);
+  }
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #89" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op90,
+    1 /* batch size */, 7 /* input height */, 7 /* input width */,
+    v90.data() /* input */, v91.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #90" << std::endl;
+    return ExecutionPlan();
+  }
+
+  {
+    const size_t a_shape[] = { 1, 96, 7, 7 };
+    const size_t b_shape[] = { 1, 96, 7, 7 };
+    status = xnn_setup_add_nd_f32(
+      op91,
+      4, a_shape, 4, b_shape,
+      v91.data() /* a */, v82.data() /* b */, v92.data() /* output */,
+      threadpool /* threadpool */);
+  }
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #91" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nchw_f32(
+    op92,
+    1 /* batch size */, 7 /* input height */, 7 /* input width */,
+    v92.data() /* input */, v93.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #92" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_hardswish_nc_f32(
+    op93,
+    49 /* batch size */,
+    v93.data() /* input */, v94.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #93" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_global_average_pooling_ncw_f32(
+    op94,
+    1 /* batch size */, 49 /* width */,
+    v94.data() /* input */, v95.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #94" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nhwc_f32(
+    op95,
+    1 /* batch size */, 1 /* input height */, 1 /* input width */,
+    v95.data() /* input */, v96.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #95" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_hardswish_nc_f32(
+    op96,
+    1 /* batch size */,
+    v96.data() /* input */, v97.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #96" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_global_average_pooling_nwc_f32(
+    op97,
+    1 /* batch size */, 1 /* width */,
+    v97.data() /* input */, v98.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #97" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nhwc_f32(
+    op98,
+    1 /* batch size */, 1 /* input height */, 1 /* input width */,
+    v98.data() /* input */, v99.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #98" << std::endl;
+    return ExecutionPlan();
+  }
+
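+  // All operators (#0 through #98) are now created and bound to their buffers;
+  // the populated ExecutionPlan is returned to the caller, which takes ownership
+  // of the operators.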
+  #pragma clang diagnostic push
+  #pragma clang diagnostic ignored "-Wpessimizing-move"
+  return operators;
+  #pragma clang diagnostic pop
+}
+
+}  // namespace models
diff --git a/models/models.h b/models/models.h
index 1315363..7306f59 100644
--- a/models/models.h
+++ b/models/models.h
@@ -20,6 +20,11 @@
 ExecutionPlan FP32MobileNetV3Large(pthreadpool_t threadpool);
 ExecutionPlan FP32MobileNetV3Small(pthreadpool_t threadpool);
 
+ExecutionPlan FP32SparseMobileNetV1(float sparsity, pthreadpool_t threadpool);
+ExecutionPlan FP32SparseMobileNetV2(float sparsity, pthreadpool_t threadpool);
+ExecutionPlan FP32SparseMobileNetV3Large(float sparsity, pthreadpool_t threadpool);
+ExecutionPlan FP32SparseMobileNetV3Small(float sparsity, pthreadpool_t threadpool);
+
 ExecutionPlan FP16MobileNetV1(pthreadpool_t threadpool);
 ExecutionPlan FP16MobileNetV2(pthreadpool_t threadpool);
 ExecutionPlan FP16MobileNetV3Large(pthreadpool_t threadpool);
@@ -28,4 +33,6 @@
 ExecutionPlan QS8MobileNetV1(pthreadpool_t threadpool);
 ExecutionPlan QS8MobileNetV2(pthreadpool_t threadpool);
 
+ExecutionPlan QU8MobileNetV1(pthreadpool_t threadpool);
+
 }  // namespace models
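For reference, a minimal sketch of how the entry points declared above are typically driven, modeled on the end-to-end benchmark flow. The sparsity value, the thread count, and the assumption that ExecutionPlan is a vector of smart-pointer-wrapped xnn_operator_t handles (per models.h) are illustrative, not part of this change:

    #include <xnnpack.h>
    #include <pthreadpool.h>

    #include "models/models.h"

    int main() {
      if (xnn_initialize(/*allocator=*/nullptr) != xnn_status_success) {
        return 1;
      }
      pthreadpool_t threadpool = pthreadpool_create(/*threads_count=*/0);
      // Illustrative sparsity level; an empty plan signals that a create/setup call failed.
      models::ExecutionPlan plan = models::FP32SparseMobileNetV3Small(0.75f, threadpool);
      if (plan.empty()) {
        return 1;
      }
      for (const auto& op : plan) {  // each entry wraps an xnn_operator_t
        if (xnn_run_operator(op.get(), threadpool) != xnn_status_success) {
          return 1;
        }
      }
      pthreadpool_destroy(threadpool);
      return 0;
    }

The QU8MobileNetV1 plan added above is driven the same way, just without a sparsity argument.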
diff --git a/models/qs8-mobilenet-v1.cc b/models/qs8-mobilenet-v1.cc
index 5c17697..0df0bd4 100644
--- a/models/qs8-mobilenet-v1.cc
+++ b/models/qs8-mobilenet-v1.cc
@@ -5,6 +5,7 @@
 
 #include <xnnpack.h>
 
+#include <array>
 #include <algorithm>
 #include <functional>
 #include <iostream>
@@ -16,183 +17,183 @@
 namespace models {
 
 ExecutionPlan QS8MobileNetV1(pthreadpool_t threadpool) {
-  alignas(16) static int8_t v0[150528];
-  alignas(16) static int8_t v1[401408];
-  alignas(16) static int8_t v2[401408];
-  alignas(16) static int8_t v3[802816];
-  alignas(16) static int8_t v4[200704];
-  alignas(16) static int8_t v5[401408];
-  alignas(16) static int8_t v6[401408];
-  alignas(16) static int8_t v7[401408];
-  alignas(16) static int8_t v8[100352];
-  alignas(16) static int8_t v9[200704];
-  alignas(16) static int8_t v10[200704];
-  alignas(16) static int8_t v11[200704];
-  alignas(16) static int8_t v12[50176];
-  alignas(16) static int8_t v13[100352];
-  alignas(16) static int8_t v14[100352];
-  alignas(16) static int8_t v15[100352];
-  alignas(16) static int8_t v16[100352];
-  alignas(16) static int8_t v17[100352];
-  alignas(16) static int8_t v18[100352];
-  alignas(16) static int8_t v19[100352];
-  alignas(16) static int8_t v20[100352];
-  alignas(16) static int8_t v21[100352];
-  alignas(16) static int8_t v22[100352];
-  alignas(16) static int8_t v23[100352];
-  alignas(16) static int8_t v24[25088];
-  alignas(16) static int8_t v25[50176];
-  alignas(16) static int8_t v26[50176];
-  alignas(16) static int8_t v27[50176];
-  alignas(16) static int8_t v28[1024];
-  alignas(16) static int8_t v29[1001];
-  alignas(16) static int8_t w30[864];
-  alignas(16) static int32_t w31[32];
-  alignas(16) static int8_t w32[288];
-  alignas(16) static int32_t w33[32];
-  alignas(16) static int8_t w34[2048];
-  alignas(16) static int32_t w35[64];
-  alignas(16) static int8_t w36[576];
-  alignas(16) static int32_t w37[64];
-  alignas(16) static int8_t w38[8192];
-  alignas(16) static int32_t w39[128];
-  alignas(16) static int8_t w40[1152];
-  alignas(16) static int32_t w41[128];
-  alignas(16) static int8_t w42[16384];
-  alignas(16) static int32_t w43[128];
-  alignas(16) static int8_t w44[1152];
-  alignas(16) static int32_t w45[128];
-  alignas(16) static int8_t w46[32768];
-  alignas(16) static int32_t w47[256];
-  alignas(16) static int8_t w48[2304];
-  alignas(16) static int32_t w49[256];
-  alignas(16) static int8_t w50[65536];
-  alignas(16) static int32_t w51[256];
-  alignas(16) static int8_t w52[2304];
-  alignas(16) static int32_t w53[256];
-  alignas(16) static int8_t w54[131072];
-  alignas(16) static int32_t w55[512];
-  alignas(16) static int8_t w56[4608];
-  alignas(16) static int32_t w57[512];
-  alignas(16) static int8_t w58[262144];
-  alignas(16) static int32_t w59[512];
-  alignas(16) static int8_t w60[4608];
-  alignas(16) static int32_t w61[512];
-  alignas(16) static int8_t w62[262144];
-  alignas(16) static int32_t w63[512];
-  alignas(16) static int8_t w64[4608];
-  alignas(16) static int32_t w65[512];
-  alignas(16) static int8_t w66[262144];
-  alignas(16) static int32_t w67[512];
-  alignas(16) static int8_t w68[4608];
-  alignas(16) static int32_t w69[512];
-  alignas(16) static int8_t w70[262144];
-  alignas(16) static int32_t w71[512];
-  alignas(16) static int8_t w72[4608];
-  alignas(16) static int32_t w73[512];
-  alignas(16) static int8_t w74[262144];
-  alignas(16) static int32_t w75[512];
-  alignas(16) static int8_t w76[4608];
-  alignas(16) static int32_t w77[512];
-  alignas(16) static int8_t w78[524288];
-  alignas(16) static int32_t w79[1024];
-  alignas(16) static int8_t w80[9216];
-  alignas(16) static int32_t w81[1024];
-  alignas(16) static int8_t w82[1048576];
-  alignas(16) static int32_t w83[1024];
-  alignas(16) static int8_t w84[1025024];
-  alignas(16) static int32_t w85[1001];
+  alignas(16) static std::array<int8_t, 150528> v0;
+  alignas(16) static std::array<int8_t, 401408> v1;
+  alignas(16) static std::array<int8_t, 401408> v2;
+  alignas(16) static std::array<int8_t, 802816> v3;
+  alignas(16) static std::array<int8_t, 200704> v4;
+  alignas(16) static std::array<int8_t, 401408> v5;
+  alignas(16) static std::array<int8_t, 401408> v6;
+  alignas(16) static std::array<int8_t, 401408> v7;
+  alignas(16) static std::array<int8_t, 100352> v8;
+  alignas(16) static std::array<int8_t, 200704> v9;
+  alignas(16) static std::array<int8_t, 200704> v10;
+  alignas(16) static std::array<int8_t, 200704> v11;
+  alignas(16) static std::array<int8_t, 50176> v12;
+  alignas(16) static std::array<int8_t, 100352> v13;
+  alignas(16) static std::array<int8_t, 100352> v14;
+  alignas(16) static std::array<int8_t, 100352> v15;
+  alignas(16) static std::array<int8_t, 100352> v16;
+  alignas(16) static std::array<int8_t, 100352> v17;
+  alignas(16) static std::array<int8_t, 100352> v18;
+  alignas(16) static std::array<int8_t, 100352> v19;
+  alignas(16) static std::array<int8_t, 100352> v20;
+  alignas(16) static std::array<int8_t, 100352> v21;
+  alignas(16) static std::array<int8_t, 100352> v22;
+  alignas(16) static std::array<int8_t, 100352> v23;
+  alignas(16) static std::array<int8_t, 25088> v24;
+  alignas(16) static std::array<int8_t, 50176> v25;
+  alignas(16) static std::array<int8_t, 50176> v26;
+  alignas(16) static std::array<int8_t, 50176> v27;
+  alignas(16) static std::array<int8_t, 1024> v28;
+  alignas(16) static std::array<int8_t, 1001> v29;
+  alignas(16) static std::array<int8_t, 864> w30;
+  alignas(16) static std::array<int32_t, 32> w31;
+  alignas(16) static std::array<int8_t, 288> w32;
+  alignas(16) static std::array<int32_t, 32> w33;
+  alignas(16) static std::array<int8_t, 2048> w34;
+  alignas(16) static std::array<int32_t, 64> w35;
+  alignas(16) static std::array<int8_t, 576> w36;
+  alignas(16) static std::array<int32_t, 64> w37;
+  alignas(16) static std::array<int8_t, 8192> w38;
+  alignas(16) static std::array<int32_t, 128> w39;
+  alignas(16) static std::array<int8_t, 1152> w40;
+  alignas(16) static std::array<int32_t, 128> w41;
+  alignas(16) static std::array<int8_t, 16384> w42;
+  alignas(16) static std::array<int32_t, 128> w43;
+  alignas(16) static std::array<int8_t, 1152> w44;
+  alignas(16) static std::array<int32_t, 128> w45;
+  alignas(16) static std::array<int8_t, 32768> w46;
+  alignas(16) static std::array<int32_t, 256> w47;
+  alignas(16) static std::array<int8_t, 2304> w48;
+  alignas(16) static std::array<int32_t, 256> w49;
+  alignas(16) static std::array<int8_t, 65536> w50;
+  alignas(16) static std::array<int32_t, 256> w51;
+  alignas(16) static std::array<int8_t, 2304> w52;
+  alignas(16) static std::array<int32_t, 256> w53;
+  alignas(16) static std::array<int8_t, 131072> w54;
+  alignas(16) static std::array<int32_t, 512> w55;
+  alignas(16) static std::array<int8_t, 4608> w56;
+  alignas(16) static std::array<int32_t, 512> w57;
+  alignas(16) static std::array<int8_t, 262144> w58;
+  alignas(16) static std::array<int32_t, 512> w59;
+  alignas(16) static std::array<int8_t, 4608> w60;
+  alignas(16) static std::array<int32_t, 512> w61;
+  alignas(16) static std::array<int8_t, 262144> w62;
+  alignas(16) static std::array<int32_t, 512> w63;
+  alignas(16) static std::array<int8_t, 4608> w64;
+  alignas(16) static std::array<int32_t, 512> w65;
+  alignas(16) static std::array<int8_t, 262144> w66;
+  alignas(16) static std::array<int32_t, 512> w67;
+  alignas(16) static std::array<int8_t, 4608> w68;
+  alignas(16) static std::array<int32_t, 512> w69;
+  alignas(16) static std::array<int8_t, 262144> w70;
+  alignas(16) static std::array<int32_t, 512> w71;
+  alignas(16) static std::array<int8_t, 4608> w72;
+  alignas(16) static std::array<int32_t, 512> w73;
+  alignas(16) static std::array<int8_t, 262144> w74;
+  alignas(16) static std::array<int32_t, 512> w75;
+  alignas(16) static std::array<int8_t, 4608> w76;
+  alignas(16) static std::array<int32_t, 512> w77;
+  alignas(16) static std::array<int8_t, 524288> w78;
+  alignas(16) static std::array<int32_t, 1024> w79;
+  alignas(16) static std::array<int8_t, 9216> w80;
+  alignas(16) static std::array<int32_t, 1024> w81;
+  alignas(16) static std::array<int8_t, 1048576> w82;
+  alignas(16) static std::array<int32_t, 1024> w83;
+  alignas(16) static std::array<int8_t, 1025024> w84;
+  alignas(16) static std::array<int32_t, 1001> w85;
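+  // These std::array buffers replace the former raw C arrays; std::array provides
+  // the begin()/end() and data() accessors used below while keeping the same
+  // static storage and 16-byte alignment.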
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto i8rng = std::bind(std::uniform_int_distribution<int32_t>(-127, 127), std::ref(rng));
   auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
-  std::generate(v0, v0 + 150528, std::ref(i8rng));
-  std::generate(v1, v1 + 401408, std::ref(i8rng));
-  std::generate(v2, v2 + 401408, std::ref(i8rng));
-  std::generate(v3, v3 + 802816, std::ref(i8rng));
-  std::generate(v4, v4 + 200704, std::ref(i8rng));
-  std::generate(v5, v5 + 401408, std::ref(i8rng));
-  std::generate(v6, v6 + 401408, std::ref(i8rng));
-  std::generate(v7, v7 + 401408, std::ref(i8rng));
-  std::generate(v8, v8 + 100352, std::ref(i8rng));
-  std::generate(v9, v9 + 200704, std::ref(i8rng));
-  std::generate(v10, v10 + 200704, std::ref(i8rng));
-  std::generate(v11, v11 + 200704, std::ref(i8rng));
-  std::generate(v12, v12 + 50176, std::ref(i8rng));
-  std::generate(v13, v13 + 100352, std::ref(i8rng));
-  std::generate(v14, v14 + 100352, std::ref(i8rng));
-  std::generate(v15, v15 + 100352, std::ref(i8rng));
-  std::generate(v16, v16 + 100352, std::ref(i8rng));
-  std::generate(v17, v17 + 100352, std::ref(i8rng));
-  std::generate(v18, v18 + 100352, std::ref(i8rng));
-  std::generate(v19, v19 + 100352, std::ref(i8rng));
-  std::generate(v20, v20 + 100352, std::ref(i8rng));
-  std::generate(v21, v21 + 100352, std::ref(i8rng));
-  std::generate(v22, v22 + 100352, std::ref(i8rng));
-  std::generate(v23, v23 + 100352, std::ref(i8rng));
-  std::generate(v24, v24 + 25088, std::ref(i8rng));
-  std::generate(v25, v25 + 50176, std::ref(i8rng));
-  std::generate(v26, v26 + 50176, std::ref(i8rng));
-  std::generate(v27, v27 + 50176, std::ref(i8rng));
-  std::generate(v28, v28 + 1024, std::ref(i8rng));
-  std::generate(v29, v29 + 1001, std::ref(i8rng));
-  std::generate(w30, w30 + 864, std::ref(i8rng));
-  std::generate(w31, w31 + 32, std::ref(i32rng));
-  std::generate(w32, w32 + 288, std::ref(i8rng));
-  std::generate(w33, w33 + 32, std::ref(i32rng));
-  std::generate(w34, w34 + 2048, std::ref(i8rng));
-  std::generate(w35, w35 + 64, std::ref(i32rng));
-  std::generate(w36, w36 + 576, std::ref(i8rng));
-  std::generate(w37, w37 + 64, std::ref(i32rng));
-  std::generate(w38, w38 + 8192, std::ref(i8rng));
-  std::generate(w39, w39 + 128, std::ref(i32rng));
-  std::generate(w40, w40 + 1152, std::ref(i8rng));
-  std::generate(w41, w41 + 128, std::ref(i32rng));
-  std::generate(w42, w42 + 16384, std::ref(i8rng));
-  std::generate(w43, w43 + 128, std::ref(i32rng));
-  std::generate(w44, w44 + 1152, std::ref(i8rng));
-  std::generate(w45, w45 + 128, std::ref(i32rng));
-  std::generate(w46, w46 + 32768, std::ref(i8rng));
-  std::generate(w47, w47 + 256, std::ref(i32rng));
-  std::generate(w48, w48 + 2304, std::ref(i8rng));
-  std::generate(w49, w49 + 256, std::ref(i32rng));
-  std::generate(w50, w50 + 65536, std::ref(i8rng));
-  std::generate(w51, w51 + 256, std::ref(i32rng));
-  std::generate(w52, w52 + 2304, std::ref(i8rng));
-  std::generate(w53, w53 + 256, std::ref(i32rng));
-  std::generate(w54, w54 + 131072, std::ref(i8rng));
-  std::generate(w55, w55 + 512, std::ref(i32rng));
-  std::generate(w56, w56 + 4608, std::ref(i8rng));
-  std::generate(w57, w57 + 512, std::ref(i32rng));
-  std::generate(w58, w58 + 262144, std::ref(i8rng));
-  std::generate(w59, w59 + 512, std::ref(i32rng));
-  std::generate(w60, w60 + 4608, std::ref(i8rng));
-  std::generate(w61, w61 + 512, std::ref(i32rng));
-  std::generate(w62, w62 + 262144, std::ref(i8rng));
-  std::generate(w63, w63 + 512, std::ref(i32rng));
-  std::generate(w64, w64 + 4608, std::ref(i8rng));
-  std::generate(w65, w65 + 512, std::ref(i32rng));
-  std::generate(w66, w66 + 262144, std::ref(i8rng));
-  std::generate(w67, w67 + 512, std::ref(i32rng));
-  std::generate(w68, w68 + 4608, std::ref(i8rng));
-  std::generate(w69, w69 + 512, std::ref(i32rng));
-  std::generate(w70, w70 + 262144, std::ref(i8rng));
-  std::generate(w71, w71 + 512, std::ref(i32rng));
-  std::generate(w72, w72 + 4608, std::ref(i8rng));
-  std::generate(w73, w73 + 512, std::ref(i32rng));
-  std::generate(w74, w74 + 262144, std::ref(i8rng));
-  std::generate(w75, w75 + 512, std::ref(i32rng));
-  std::generate(w76, w76 + 4608, std::ref(i8rng));
-  std::generate(w77, w77 + 512, std::ref(i32rng));
-  std::generate(w78, w78 + 524288, std::ref(i8rng));
-  std::generate(w79, w79 + 1024, std::ref(i32rng));
-  std::generate(w80, w80 + 9216, std::ref(i8rng));
-  std::generate(w81, w81 + 1024, std::ref(i32rng));
-  std::generate(w82, w82 + 1048576, std::ref(i8rng));
-  std::generate(w83, w83 + 1024, std::ref(i32rng));
-  std::generate(w84, w84 + 1025024, std::ref(i8rng));
-  std::generate(w85, w85 + 1001, std::ref(i32rng));
+  std::generate(v0.begin(), v0.end(), std::ref(i8rng));
+  std::generate(v1.begin(), v1.end(), std::ref(i8rng));
+  std::generate(v2.begin(), v2.end(), std::ref(i8rng));
+  std::generate(v3.begin(), v3.end(), std::ref(i8rng));
+  std::generate(v4.begin(), v4.end(), std::ref(i8rng));
+  std::generate(v5.begin(), v5.end(), std::ref(i8rng));
+  std::generate(v6.begin(), v6.end(), std::ref(i8rng));
+  std::generate(v7.begin(), v7.end(), std::ref(i8rng));
+  std::generate(v8.begin(), v8.end(), std::ref(i8rng));
+  std::generate(v9.begin(), v9.end(), std::ref(i8rng));
+  std::generate(v10.begin(), v10.end(), std::ref(i8rng));
+  std::generate(v11.begin(), v11.end(), std::ref(i8rng));
+  std::generate(v12.begin(), v12.end(), std::ref(i8rng));
+  std::generate(v13.begin(), v13.end(), std::ref(i8rng));
+  std::generate(v14.begin(), v14.end(), std::ref(i8rng));
+  std::generate(v15.begin(), v15.end(), std::ref(i8rng));
+  std::generate(v16.begin(), v16.end(), std::ref(i8rng));
+  std::generate(v17.begin(), v17.end(), std::ref(i8rng));
+  std::generate(v18.begin(), v18.end(), std::ref(i8rng));
+  std::generate(v19.begin(), v19.end(), std::ref(i8rng));
+  std::generate(v20.begin(), v20.end(), std::ref(i8rng));
+  std::generate(v21.begin(), v21.end(), std::ref(i8rng));
+  std::generate(v22.begin(), v22.end(), std::ref(i8rng));
+  std::generate(v23.begin(), v23.end(), std::ref(i8rng));
+  std::generate(v24.begin(), v24.end(), std::ref(i8rng));
+  std::generate(v25.begin(), v25.end(), std::ref(i8rng));
+  std::generate(v26.begin(), v26.end(), std::ref(i8rng));
+  std::generate(v27.begin(), v27.end(), std::ref(i8rng));
+  std::generate(v28.begin(), v28.end(), std::ref(i8rng));
+  std::generate(v29.begin(), v29.end(), std::ref(i8rng));
+  std::generate(w30.begin(), w30.end(), std::ref(i8rng));
+  std::generate(w31.begin(), w31.end(), std::ref(i32rng));
+  std::generate(w32.begin(), w32.end(), std::ref(i8rng));
+  std::generate(w33.begin(), w33.end(), std::ref(i32rng));
+  std::generate(w34.begin(), w34.end(), std::ref(i8rng));
+  std::generate(w35.begin(), w35.end(), std::ref(i32rng));
+  std::generate(w36.begin(), w36.end(), std::ref(i8rng));
+  std::generate(w37.begin(), w37.end(), std::ref(i32rng));
+  std::generate(w38.begin(), w38.end(), std::ref(i8rng));
+  std::generate(w39.begin(), w39.end(), std::ref(i32rng));
+  std::generate(w40.begin(), w40.end(), std::ref(i8rng));
+  std::generate(w41.begin(), w41.end(), std::ref(i32rng));
+  std::generate(w42.begin(), w42.end(), std::ref(i8rng));
+  std::generate(w43.begin(), w43.end(), std::ref(i32rng));
+  std::generate(w44.begin(), w44.end(), std::ref(i8rng));
+  std::generate(w45.begin(), w45.end(), std::ref(i32rng));
+  std::generate(w46.begin(), w46.end(), std::ref(i8rng));
+  std::generate(w47.begin(), w47.end(), std::ref(i32rng));
+  std::generate(w48.begin(), w48.end(), std::ref(i8rng));
+  std::generate(w49.begin(), w49.end(), std::ref(i32rng));
+  std::generate(w50.begin(), w50.end(), std::ref(i8rng));
+  std::generate(w51.begin(), w51.end(), std::ref(i32rng));
+  std::generate(w52.begin(), w52.end(), std::ref(i8rng));
+  std::generate(w53.begin(), w53.end(), std::ref(i32rng));
+  std::generate(w54.begin(), w54.end(), std::ref(i8rng));
+  std::generate(w55.begin(), w55.end(), std::ref(i32rng));
+  std::generate(w56.begin(), w56.end(), std::ref(i8rng));
+  std::generate(w57.begin(), w57.end(), std::ref(i32rng));
+  std::generate(w58.begin(), w58.end(), std::ref(i8rng));
+  std::generate(w59.begin(), w59.end(), std::ref(i32rng));
+  std::generate(w60.begin(), w60.end(), std::ref(i8rng));
+  std::generate(w61.begin(), w61.end(), std::ref(i32rng));
+  std::generate(w62.begin(), w62.end(), std::ref(i8rng));
+  std::generate(w63.begin(), w63.end(), std::ref(i32rng));
+  std::generate(w64.begin(), w64.end(), std::ref(i8rng));
+  std::generate(w65.begin(), w65.end(), std::ref(i32rng));
+  std::generate(w66.begin(), w66.end(), std::ref(i8rng));
+  std::generate(w67.begin(), w67.end(), std::ref(i32rng));
+  std::generate(w68.begin(), w68.end(), std::ref(i8rng));
+  std::generate(w69.begin(), w69.end(), std::ref(i32rng));
+  std::generate(w70.begin(), w70.end(), std::ref(i8rng));
+  std::generate(w71.begin(), w71.end(), std::ref(i32rng));
+  std::generate(w72.begin(), w72.end(), std::ref(i8rng));
+  std::generate(w73.begin(), w73.end(), std::ref(i32rng));
+  std::generate(w74.begin(), w74.end(), std::ref(i8rng));
+  std::generate(w75.begin(), w75.end(), std::ref(i32rng));
+  std::generate(w76.begin(), w76.end(), std::ref(i8rng));
+  std::generate(w77.begin(), w77.end(), std::ref(i32rng));
+  std::generate(w78.begin(), w78.end(), std::ref(i8rng));
+  std::generate(w79.begin(), w79.end(), std::ref(i32rng));
+  std::generate(w80.begin(), w80.end(), std::ref(i8rng));
+  std::generate(w81.begin(), w81.end(), std::ref(i32rng));
+  std::generate(w82.begin(), w82.end(), std::ref(i8rng));
+  std::generate(w83.begin(), w83.end(), std::ref(i32rng));
+  std::generate(w84.begin(), w84.end(), std::ref(i8rng));
+  std::generate(w85.begin(), w85.end(), std::ref(i32rng));
 
   ExecutionPlan operators;
   xnn_status status;
@@ -210,7 +211,7 @@
     3 /* input pixel stride */,
     32 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w30, w31,
+    w30.data(), w31.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op0);
@@ -233,7 +234,7 @@
     32 /* input pixel stride */,
     32 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w32, w33,
+    w32.data(), w33.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op1);
@@ -256,7 +257,7 @@
     32 /* input pixel stride */,
     64 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w34, w35,
+    w34.data(), w35.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op2);
@@ -279,7 +280,7 @@
     64 /* input pixel stride */,
     64 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w36, w37,
+    w36.data(), w37.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op3);
@@ -302,7 +303,7 @@
     64 /* input pixel stride */,
     128 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w38, w39,
+    w38.data(), w39.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op4);
@@ -325,7 +326,7 @@
     128 /* input pixel stride */,
     128 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w40, w41,
+    w40.data(), w41.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op5);
@@ -348,7 +349,7 @@
     128 /* input pixel stride */,
     128 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w42, w43,
+    w42.data(), w43.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op6);
@@ -371,7 +372,7 @@
     128 /* input pixel stride */,
     128 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w44, w45,
+    w44.data(), w45.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op7);
@@ -394,7 +395,7 @@
     128 /* input pixel stride */,
     256 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w46, w47,
+    w46.data(), w47.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op8);
@@ -417,7 +418,7 @@
     256 /* input pixel stride */,
     256 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w48, w49,
+    w48.data(), w49.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op9);
@@ -440,7 +441,7 @@
     256 /* input pixel stride */,
     256 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w50, w51,
+    w50.data(), w51.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op10);
@@ -463,7 +464,7 @@
     256 /* input pixel stride */,
     256 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w52, w53,
+    w52.data(), w53.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op11);
@@ -486,7 +487,7 @@
     256 /* input pixel stride */,
     512 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w54, w55,
+    w54.data(), w55.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op12);
@@ -509,7 +510,7 @@
     512 /* input pixel stride */,
     512 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w56, w57,
+    w56.data(), w57.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op13);
@@ -532,7 +533,7 @@
     512 /* input pixel stride */,
     512 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w58, w59,
+    w58.data(), w59.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op14);
@@ -555,7 +556,7 @@
     512 /* input pixel stride */,
     512 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w60, w61,
+    w60.data(), w61.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op15);
@@ -578,7 +579,7 @@
     512 /* input pixel stride */,
     512 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w62, w63,
+    w62.data(), w63.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op16);
@@ -601,7 +602,7 @@
     512 /* input pixel stride */,
     512 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w64, w65,
+    w64.data(), w65.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op17);
@@ -624,7 +625,7 @@
     512 /* input pixel stride */,
     512 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w66, w67,
+    w66.data(), w67.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op18);
@@ -647,7 +648,7 @@
     512 /* input pixel stride */,
     512 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w68, w69,
+    w68.data(), w69.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op19);
@@ -670,7 +671,7 @@
     512 /* input pixel stride */,
     512 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w70, w71,
+    w70.data(), w71.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op20);
@@ -693,7 +694,7 @@
     512 /* input pixel stride */,
     512 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w72, w73,
+    w72.data(), w73.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op21);
@@ -716,7 +717,7 @@
     512 /* input pixel stride */,
     512 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w74, w75,
+    w74.data(), w75.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op22);
@@ -739,7 +740,7 @@
     512 /* input pixel stride */,
     512 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w76, w77,
+    w76.data(), w77.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op23);
@@ -762,7 +763,7 @@
     512 /* input pixel stride */,
     1024 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w78, w79,
+    w78.data(), w79.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op24);
@@ -785,7 +786,7 @@
     1024 /* input pixel stride */,
     1024 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w80, w81,
+    w80.data(), w81.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op25);
@@ -808,7 +809,7 @@
     1024 /* input pixel stride */,
     1024 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w82, w83,
+    w82.data(), w83.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op26);
@@ -845,7 +846,7 @@
     1024 /* input pixel stride */,
     1001 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w84, w85,
+    w84.data(), w85.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op28);
@@ -860,7 +861,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op0,
     1 /* batch size */, 224 /* input height */, 224 /* input width */,
-    v0 /* input */, v1 /* output */,
+    v0.data() /* input */, v1.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #0" << std::endl;
@@ -870,7 +871,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op1,
     1 /* batch size */, 112 /* input height */, 112 /* input width */,
-    v1 /* input */, v2 /* output */,
+    v1.data() /* input */, v2.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #1" << std::endl;
@@ -880,7 +881,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op2,
     1 /* batch size */, 112 /* input height */, 112 /* input width */,
-    v2 /* input */, v3 /* output */,
+    v2.data() /* input */, v3.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #2" << std::endl;
@@ -890,7 +891,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op3,
     1 /* batch size */, 112 /* input height */, 112 /* input width */,
-    v3 /* input */, v4 /* output */,
+    v3.data() /* input */, v4.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #3" << std::endl;
@@ -900,7 +901,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op4,
     1 /* batch size */, 56 /* input height */, 56 /* input width */,
-    v4 /* input */, v5 /* output */,
+    v4.data() /* input */, v5.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #4" << std::endl;
@@ -910,7 +911,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op5,
     1 /* batch size */, 56 /* input height */, 56 /* input width */,
-    v5 /* input */, v6 /* output */,
+    v5.data() /* input */, v6.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #5" << std::endl;
@@ -920,7 +921,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op6,
     1 /* batch size */, 56 /* input height */, 56 /* input width */,
-    v6 /* input */, v7 /* output */,
+    v6.data() /* input */, v7.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #6" << std::endl;
@@ -930,7 +931,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op7,
     1 /* batch size */, 56 /* input height */, 56 /* input width */,
-    v7 /* input */, v8 /* output */,
+    v7.data() /* input */, v8.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #7" << std::endl;
@@ -940,7 +941,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op8,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v8 /* input */, v9 /* output */,
+    v8.data() /* input */, v9.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #8" << std::endl;
@@ -950,7 +951,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op9,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v9 /* input */, v10 /* output */,
+    v9.data() /* input */, v10.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #9" << std::endl;
@@ -960,7 +961,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op10,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v10 /* input */, v11 /* output */,
+    v10.data() /* input */, v11.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #10" << std::endl;
@@ -970,7 +971,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op11,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v11 /* input */, v12 /* output */,
+    v11.data() /* input */, v12.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #11" << std::endl;
@@ -980,7 +981,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op12,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v12 /* input */, v13 /* output */,
+    v12.data() /* input */, v13.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #12" << std::endl;
@@ -990,7 +991,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op13,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v13 /* input */, v14 /* output */,
+    v13.data() /* input */, v14.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #13" << std::endl;
@@ -1000,7 +1001,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op14,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v14 /* input */, v15 /* output */,
+    v14.data() /* input */, v15.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #14" << std::endl;
@@ -1010,7 +1011,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op15,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v15 /* input */, v16 /* output */,
+    v15.data() /* input */, v16.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #15" << std::endl;
@@ -1020,7 +1021,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op16,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v16 /* input */, v17 /* output */,
+    v16.data() /* input */, v17.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #16" << std::endl;
@@ -1030,7 +1031,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op17,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v17 /* input */, v18 /* output */,
+    v17.data() /* input */, v18.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #17" << std::endl;
@@ -1040,7 +1041,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op18,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v18 /* input */, v19 /* output */,
+    v18.data() /* input */, v19.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #18" << std::endl;
@@ -1050,7 +1051,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op19,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v19 /* input */, v20 /* output */,
+    v19.data() /* input */, v20.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #19" << std::endl;
@@ -1060,7 +1061,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op20,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v20 /* input */, v21 /* output */,
+    v20.data() /* input */, v21.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #20" << std::endl;
@@ -1070,7 +1071,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op21,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v21 /* input */, v22 /* output */,
+    v21.data() /* input */, v22.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #21" << std::endl;
@@ -1080,7 +1081,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op22,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v22 /* input */, v23 /* output */,
+    v22.data() /* input */, v23.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #22" << std::endl;
@@ -1090,7 +1091,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op23,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v23 /* input */, v24 /* output */,
+    v23.data() /* input */, v24.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #23" << std::endl;
@@ -1100,7 +1101,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op24,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v24 /* input */, v25 /* output */,
+    v24.data() /* input */, v25.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #24" << std::endl;
@@ -1110,7 +1111,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op25,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v25 /* input */, v26 /* output */,
+    v25.data() /* input */, v26.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #25" << std::endl;
@@ -1120,7 +1121,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op26,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v26 /* input */, v27 /* output */,
+    v26.data() /* input */, v27.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #26" << std::endl;
@@ -1130,7 +1131,7 @@
   status = xnn_setup_global_average_pooling_nwc_qs8(
     op27,
     1 /* batch size */, 49 /* width */,
-    v27 /* input */, v28 /* output */,
+    v27.data() /* input */, v28.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #27" << std::endl;
@@ -1140,7 +1141,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op28,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v28 /* input */, v29 /* output */,
+    v28.data() /* input */, v29.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #28" << std::endl;
diff --git a/models/qs8-mobilenet-v2.cc b/models/qs8-mobilenet-v2.cc
index 411a166..3061c72 100644
--- a/models/qs8-mobilenet-v2.cc
+++ b/models/qs8-mobilenet-v2.cc
@@ -5,6 +5,7 @@
 
 #include <xnnpack.h>
 
+#include <array>
 #include <algorithm>
 #include <functional>
 #include <iostream>
@@ -16,353 +17,353 @@
 namespace models {
 
 ExecutionPlan QS8MobileNetV2(pthreadpool_t threadpool) {
-  alignas(16) static int8_t v0[150528];
-  alignas(16) static int8_t v1[401408];
-  alignas(16) static int8_t v2[401408];
-  alignas(16) static int8_t v3[200704];
-  alignas(16) static int8_t v4[1204224];
-  alignas(16) static int8_t v5[301056];
-  alignas(16) static int8_t v6[75264];
-  alignas(16) static int8_t v7[451584];
-  alignas(16) static int8_t v8[451584];
-  alignas(16) static int8_t v9[75264];
-  alignas(16) static int8_t v10[75264];
-  alignas(16) static int8_t v11[451584];
-  alignas(16) static int8_t v12[112896];
-  alignas(16) static int8_t v13[25088];
-  alignas(16) static int8_t v14[150528];
-  alignas(16) static int8_t v15[150528];
-  alignas(16) static int8_t v16[25088];
-  alignas(16) static int8_t v17[25088];
-  alignas(16) static int8_t v18[150528];
-  alignas(16) static int8_t v19[150528];
-  alignas(16) static int8_t v20[25088];
-  alignas(16) static int8_t v21[25088];
-  alignas(16) static int8_t v22[150528];
-  alignas(16) static int8_t v23[37632];
-  alignas(16) static int8_t v24[12544];
-  alignas(16) static int8_t v25[75264];
-  alignas(16) static int8_t v26[75264];
-  alignas(16) static int8_t v27[12544];
-  alignas(16) static int8_t v28[12544];
-  alignas(16) static int8_t v29[75264];
-  alignas(16) static int8_t v30[75264];
-  alignas(16) static int8_t v31[12544];
-  alignas(16) static int8_t v32[12544];
-  alignas(16) static int8_t v33[75264];
-  alignas(16) static int8_t v34[75264];
-  alignas(16) static int8_t v35[12544];
-  alignas(16) static int8_t v36[12544];
-  alignas(16) static int8_t v37[75264];
-  alignas(16) static int8_t v38[75264];
-  alignas(16) static int8_t v39[18816];
-  alignas(16) static int8_t v40[112896];
-  alignas(16) static int8_t v41[112896];
-  alignas(16) static int8_t v42[18816];
-  alignas(16) static int8_t v43[18816];
-  alignas(16) static int8_t v44[112896];
-  alignas(16) static int8_t v45[112896];
-  alignas(16) static int8_t v46[18816];
-  alignas(16) static int8_t v47[18816];
-  alignas(16) static int8_t v48[112896];
-  alignas(16) static int8_t v49[28224];
-  alignas(16) static int8_t v50[7840];
-  alignas(16) static int8_t v51[47040];
-  alignas(16) static int8_t v52[47040];
-  alignas(16) static int8_t v53[7840];
-  alignas(16) static int8_t v54[7840];
-  alignas(16) static int8_t v55[47040];
-  alignas(16) static int8_t v56[47040];
-  alignas(16) static int8_t v57[7840];
-  alignas(16) static int8_t v58[7840];
-  alignas(16) static int8_t v59[47040];
-  alignas(16) static int8_t v60[47040];
-  alignas(16) static int8_t v61[15680];
-  alignas(16) static int8_t v62[62720];
-  alignas(16) static int8_t v63[1280];
-  alignas(16) static int8_t v64[1001];
-  alignas(16) static int8_t w65[864];
-  alignas(16) static int32_t w66[32];
-  alignas(16) static int8_t w67[288];
-  alignas(16) static int32_t w68[32];
-  alignas(16) static int8_t w69[512];
-  alignas(16) static int32_t w70[16];
-  alignas(16) static int8_t w71[1536];
-  alignas(16) static int32_t w72[96];
-  alignas(16) static int8_t w73[864];
-  alignas(16) static int32_t w74[96];
-  alignas(16) static int8_t w75[2304];
-  alignas(16) static int32_t w76[24];
-  alignas(16) static int8_t w77[3456];
-  alignas(16) static int32_t w78[144];
-  alignas(16) static int8_t w79[1296];
-  alignas(16) static int32_t w80[144];
-  alignas(16) static int8_t w81[3456];
-  alignas(16) static int32_t w82[24];
-  alignas(16) static int8_t w83[3456];
-  alignas(16) static int32_t w84[144];
-  alignas(16) static int8_t w85[1296];
-  alignas(16) static int32_t w86[144];
-  alignas(16) static int8_t w87[4608];
-  alignas(16) static int32_t w88[32];
-  alignas(16) static int8_t w89[6144];
-  alignas(16) static int32_t w90[192];
-  alignas(16) static int8_t w91[1728];
-  alignas(16) static int32_t w92[192];
-  alignas(16) static int8_t w93[6144];
-  alignas(16) static int32_t w94[32];
-  alignas(16) static int8_t w95[6144];
-  alignas(16) static int32_t w96[192];
-  alignas(16) static int8_t w97[1728];
-  alignas(16) static int32_t w98[192];
-  alignas(16) static int8_t w99[6144];
-  alignas(16) static int32_t w100[32];
-  alignas(16) static int8_t w101[6144];
-  alignas(16) static int32_t w102[192];
-  alignas(16) static int8_t w103[1728];
-  alignas(16) static int32_t w104[192];
-  alignas(16) static int8_t w105[12288];
-  alignas(16) static int32_t w106[64];
-  alignas(16) static int8_t w107[24576];
-  alignas(16) static int32_t w108[384];
-  alignas(16) static int8_t w109[3456];
-  alignas(16) static int32_t w110[384];
-  alignas(16) static int8_t w111[24576];
-  alignas(16) static int32_t w112[64];
-  alignas(16) static int8_t w113[24576];
-  alignas(16) static int32_t w114[384];
-  alignas(16) static int8_t w115[3456];
-  alignas(16) static int32_t w116[384];
-  alignas(16) static int8_t w117[24576];
-  alignas(16) static int32_t w118[64];
-  alignas(16) static int8_t w119[24576];
-  alignas(16) static int32_t w120[384];
-  alignas(16) static int8_t w121[3456];
-  alignas(16) static int32_t w122[384];
-  alignas(16) static int8_t w123[24576];
-  alignas(16) static int32_t w124[64];
-  alignas(16) static int8_t w125[24576];
-  alignas(16) static int32_t w126[384];
-  alignas(16) static int8_t w127[3456];
-  alignas(16) static int32_t w128[384];
-  alignas(16) static int8_t w129[36864];
-  alignas(16) static int32_t w130[96];
-  alignas(16) static int8_t w131[55296];
-  alignas(16) static int32_t w132[576];
-  alignas(16) static int8_t w133[5184];
-  alignas(16) static int32_t w134[576];
-  alignas(16) static int8_t w135[55296];
-  alignas(16) static int32_t w136[96];
-  alignas(16) static int8_t w137[55296];
-  alignas(16) static int32_t w138[576];
-  alignas(16) static int8_t w139[5184];
-  alignas(16) static int32_t w140[576];
-  alignas(16) static int8_t w141[55296];
-  alignas(16) static int32_t w142[96];
-  alignas(16) static int8_t w143[55296];
-  alignas(16) static int32_t w144[576];
-  alignas(16) static int8_t w145[5184];
-  alignas(16) static int32_t w146[576];
-  alignas(16) static int8_t w147[92160];
-  alignas(16) static int32_t w148[160];
-  alignas(16) static int8_t w149[153600];
-  alignas(16) static int32_t w150[960];
-  alignas(16) static int8_t w151[8640];
-  alignas(16) static int32_t w152[960];
-  alignas(16) static int8_t w153[153600];
-  alignas(16) static int32_t w154[160];
-  alignas(16) static int8_t w155[153600];
-  alignas(16) static int32_t w156[960];
-  alignas(16) static int8_t w157[8640];
-  alignas(16) static int32_t w158[960];
-  alignas(16) static int8_t w159[153600];
-  alignas(16) static int32_t w160[160];
-  alignas(16) static int8_t w161[153600];
-  alignas(16) static int32_t w162[960];
-  alignas(16) static int8_t w163[8640];
-  alignas(16) static int32_t w164[960];
-  alignas(16) static int8_t w165[307200];
-  alignas(16) static int32_t w166[320];
-  alignas(16) static int8_t w167[409600];
-  alignas(16) static int32_t w168[1280];
-  alignas(16) static int8_t w169[1281280];
-  alignas(16) static int32_t w170[1001];
+  alignas(16) static std::array<int8_t, 150528> v0;
+  alignas(16) static std::array<int8_t, 401408> v1;
+  alignas(16) static std::array<int8_t, 401408> v2;
+  alignas(16) static std::array<int8_t, 200704> v3;
+  alignas(16) static std::array<int8_t, 1204224> v4;
+  alignas(16) static std::array<int8_t, 301056> v5;
+  alignas(16) static std::array<int8_t, 75264> v6;
+  alignas(16) static std::array<int8_t, 451584> v7;
+  alignas(16) static std::array<int8_t, 451584> v8;
+  alignas(16) static std::array<int8_t, 75264> v9;
+  alignas(16) static std::array<int8_t, 75264> v10;
+  alignas(16) static std::array<int8_t, 451584> v11;
+  alignas(16) static std::array<int8_t, 112896> v12;
+  alignas(16) static std::array<int8_t, 25088> v13;
+  alignas(16) static std::array<int8_t, 150528> v14;
+  alignas(16) static std::array<int8_t, 150528> v15;
+  alignas(16) static std::array<int8_t, 25088> v16;
+  alignas(16) static std::array<int8_t, 25088> v17;
+  alignas(16) static std::array<int8_t, 150528> v18;
+  alignas(16) static std::array<int8_t, 150528> v19;
+  alignas(16) static std::array<int8_t, 25088> v20;
+  alignas(16) static std::array<int8_t, 25088> v21;
+  alignas(16) static std::array<int8_t, 150528> v22;
+  alignas(16) static std::array<int8_t, 37632> v23;
+  alignas(16) static std::array<int8_t, 12544> v24;
+  alignas(16) static std::array<int8_t, 75264> v25;
+  alignas(16) static std::array<int8_t, 75264> v26;
+  alignas(16) static std::array<int8_t, 12544> v27;
+  alignas(16) static std::array<int8_t, 12544> v28;
+  alignas(16) static std::array<int8_t, 75264> v29;
+  alignas(16) static std::array<int8_t, 75264> v30;
+  alignas(16) static std::array<int8_t, 12544> v31;
+  alignas(16) static std::array<int8_t, 12544> v32;
+  alignas(16) static std::array<int8_t, 75264> v33;
+  alignas(16) static std::array<int8_t, 75264> v34;
+  alignas(16) static std::array<int8_t, 12544> v35;
+  alignas(16) static std::array<int8_t, 12544> v36;
+  alignas(16) static std::array<int8_t, 75264> v37;
+  alignas(16) static std::array<int8_t, 75264> v38;
+  alignas(16) static std::array<int8_t, 18816> v39;
+  alignas(16) static std::array<int8_t, 112896> v40;
+  alignas(16) static std::array<int8_t, 112896> v41;
+  alignas(16) static std::array<int8_t, 18816> v42;
+  alignas(16) static std::array<int8_t, 18816> v43;
+  alignas(16) static std::array<int8_t, 112896> v44;
+  alignas(16) static std::array<int8_t, 112896> v45;
+  alignas(16) static std::array<int8_t, 18816> v46;
+  alignas(16) static std::array<int8_t, 18816> v47;
+  alignas(16) static std::array<int8_t, 112896> v48;
+  alignas(16) static std::array<int8_t, 28224> v49;
+  alignas(16) static std::array<int8_t, 7840> v50;
+  alignas(16) static std::array<int8_t, 47040> v51;
+  alignas(16) static std::array<int8_t, 47040> v52;
+  alignas(16) static std::array<int8_t, 7840> v53;
+  alignas(16) static std::array<int8_t, 7840> v54;
+  alignas(16) static std::array<int8_t, 47040> v55;
+  alignas(16) static std::array<int8_t, 47040> v56;
+  alignas(16) static std::array<int8_t, 7840> v57;
+  alignas(16) static std::array<int8_t, 7840> v58;
+  alignas(16) static std::array<int8_t, 47040> v59;
+  alignas(16) static std::array<int8_t, 47040> v60;
+  alignas(16) static std::array<int8_t, 15680> v61;
+  alignas(16) static std::array<int8_t, 62720> v62;
+  alignas(16) static std::array<int8_t, 1280> v63;
+  alignas(16) static std::array<int8_t, 1001> v64;
+  alignas(16) static std::array<int8_t, 864> w65;
+  alignas(16) static std::array<int32_t, 32> w66;
+  alignas(16) static std::array<int8_t, 288> w67;
+  alignas(16) static std::array<int32_t, 32> w68;
+  alignas(16) static std::array<int8_t, 512> w69;
+  alignas(16) static std::array<int32_t, 16> w70;
+  alignas(16) static std::array<int8_t, 1536> w71;
+  alignas(16) static std::array<int32_t, 96> w72;
+  alignas(16) static std::array<int8_t, 864> w73;
+  alignas(16) static std::array<int32_t, 96> w74;
+  alignas(16) static std::array<int8_t, 2304> w75;
+  alignas(16) static std::array<int32_t, 24> w76;
+  alignas(16) static std::array<int8_t, 3456> w77;
+  alignas(16) static std::array<int32_t, 144> w78;
+  alignas(16) static std::array<int8_t, 1296> w79;
+  alignas(16) static std::array<int32_t, 144> w80;
+  alignas(16) static std::array<int8_t, 3456> w81;
+  alignas(16) static std::array<int32_t, 24> w82;
+  alignas(16) static std::array<int8_t, 3456> w83;
+  alignas(16) static std::array<int32_t, 144> w84;
+  alignas(16) static std::array<int8_t, 1296> w85;
+  alignas(16) static std::array<int32_t, 144> w86;
+  alignas(16) static std::array<int8_t, 4608> w87;
+  alignas(16) static std::array<int32_t, 32> w88;
+  alignas(16) static std::array<int8_t, 6144> w89;
+  alignas(16) static std::array<int32_t, 192> w90;
+  alignas(16) static std::array<int8_t, 1728> w91;
+  alignas(16) static std::array<int32_t, 192> w92;
+  alignas(16) static std::array<int8_t, 6144> w93;
+  alignas(16) static std::array<int32_t, 32> w94;
+  alignas(16) static std::array<int8_t, 6144> w95;
+  alignas(16) static std::array<int32_t, 192> w96;
+  alignas(16) static std::array<int8_t, 1728> w97;
+  alignas(16) static std::array<int32_t, 192> w98;
+  alignas(16) static std::array<int8_t, 6144> w99;
+  alignas(16) static std::array<int32_t, 32> w100;
+  alignas(16) static std::array<int8_t, 6144> w101;
+  alignas(16) static std::array<int32_t, 192> w102;
+  alignas(16) static std::array<int8_t, 1728> w103;
+  alignas(16) static std::array<int32_t, 192> w104;
+  alignas(16) static std::array<int8_t, 12288> w105;
+  alignas(16) static std::array<int32_t, 64> w106;
+  alignas(16) static std::array<int8_t, 24576> w107;
+  alignas(16) static std::array<int32_t, 384> w108;
+  alignas(16) static std::array<int8_t, 3456> w109;
+  alignas(16) static std::array<int32_t, 384> w110;
+  alignas(16) static std::array<int8_t, 24576> w111;
+  alignas(16) static std::array<int32_t, 64> w112;
+  alignas(16) static std::array<int8_t, 24576> w113;
+  alignas(16) static std::array<int32_t, 384> w114;
+  alignas(16) static std::array<int8_t, 3456> w115;
+  alignas(16) static std::array<int32_t, 384> w116;
+  alignas(16) static std::array<int8_t, 24576> w117;
+  alignas(16) static std::array<int32_t, 64> w118;
+  alignas(16) static std::array<int8_t, 24576> w119;
+  alignas(16) static std::array<int32_t, 384> w120;
+  alignas(16) static std::array<int8_t, 3456> w121;
+  alignas(16) static std::array<int32_t, 384> w122;
+  alignas(16) static std::array<int8_t, 24576> w123;
+  alignas(16) static std::array<int32_t, 64> w124;
+  alignas(16) static std::array<int8_t, 24576> w125;
+  alignas(16) static std::array<int32_t, 384> w126;
+  alignas(16) static std::array<int8_t, 3456> w127;
+  alignas(16) static std::array<int32_t, 384> w128;
+  alignas(16) static std::array<int8_t, 36864> w129;
+  alignas(16) static std::array<int32_t, 96> w130;
+  alignas(16) static std::array<int8_t, 55296> w131;
+  alignas(16) static std::array<int32_t, 576> w132;
+  alignas(16) static std::array<int8_t, 5184> w133;
+  alignas(16) static std::array<int32_t, 576> w134;
+  alignas(16) static std::array<int8_t, 55296> w135;
+  alignas(16) static std::array<int32_t, 96> w136;
+  alignas(16) static std::array<int8_t, 55296> w137;
+  alignas(16) static std::array<int32_t, 576> w138;
+  alignas(16) static std::array<int8_t, 5184> w139;
+  alignas(16) static std::array<int32_t, 576> w140;
+  alignas(16) static std::array<int8_t, 55296> w141;
+  alignas(16) static std::array<int32_t, 96> w142;
+  alignas(16) static std::array<int8_t, 55296> w143;
+  alignas(16) static std::array<int32_t, 576> w144;
+  alignas(16) static std::array<int8_t, 5184> w145;
+  alignas(16) static std::array<int32_t, 576> w146;
+  alignas(16) static std::array<int8_t, 92160> w147;
+  alignas(16) static std::array<int32_t, 160> w148;
+  alignas(16) static std::array<int8_t, 153600> w149;
+  alignas(16) static std::array<int32_t, 960> w150;
+  alignas(16) static std::array<int8_t, 8640> w151;
+  alignas(16) static std::array<int32_t, 960> w152;
+  alignas(16) static std::array<int8_t, 153600> w153;
+  alignas(16) static std::array<int32_t, 160> w154;
+  alignas(16) static std::array<int8_t, 153600> w155;
+  alignas(16) static std::array<int32_t, 960> w156;
+  alignas(16) static std::array<int8_t, 8640> w157;
+  alignas(16) static std::array<int32_t, 960> w158;
+  alignas(16) static std::array<int8_t, 153600> w159;
+  alignas(16) static std::array<int32_t, 160> w160;
+  alignas(16) static std::array<int8_t, 153600> w161;
+  alignas(16) static std::array<int32_t, 960> w162;
+  alignas(16) static std::array<int8_t, 8640> w163;
+  alignas(16) static std::array<int32_t, 960> w164;
+  alignas(16) static std::array<int8_t, 307200> w165;
+  alignas(16) static std::array<int32_t, 320> w166;
+  alignas(16) static std::array<int8_t, 409600> w167;
+  alignas(16) static std::array<int32_t, 1280> w168;
+  alignas(16) static std::array<int8_t, 1281280> w169;
+  alignas(16) static std::array<int32_t, 1001> w170;
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto i8rng = std::bind(std::uniform_int_distribution<int32_t>(-127, 127), std::ref(rng));
   auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
-  std::generate(v0, v0 + 150528, std::ref(i8rng));
-  std::generate(v1, v1 + 401408, std::ref(i8rng));
-  std::generate(v2, v2 + 401408, std::ref(i8rng));
-  std::generate(v3, v3 + 200704, std::ref(i8rng));
-  std::generate(v4, v4 + 1204224, std::ref(i8rng));
-  std::generate(v5, v5 + 301056, std::ref(i8rng));
-  std::generate(v6, v6 + 75264, std::ref(i8rng));
-  std::generate(v7, v7 + 451584, std::ref(i8rng));
-  std::generate(v8, v8 + 451584, std::ref(i8rng));
-  std::generate(v9, v9 + 75264, std::ref(i8rng));
-  std::generate(v10, v10 + 75264, std::ref(i8rng));
-  std::generate(v11, v11 + 451584, std::ref(i8rng));
-  std::generate(v12, v12 + 112896, std::ref(i8rng));
-  std::generate(v13, v13 + 25088, std::ref(i8rng));
-  std::generate(v14, v14 + 150528, std::ref(i8rng));
-  std::generate(v15, v15 + 150528, std::ref(i8rng));
-  std::generate(v16, v16 + 25088, std::ref(i8rng));
-  std::generate(v17, v17 + 25088, std::ref(i8rng));
-  std::generate(v18, v18 + 150528, std::ref(i8rng));
-  std::generate(v19, v19 + 150528, std::ref(i8rng));
-  std::generate(v20, v20 + 25088, std::ref(i8rng));
-  std::generate(v21, v21 + 25088, std::ref(i8rng));
-  std::generate(v22, v22 + 150528, std::ref(i8rng));
-  std::generate(v23, v23 + 37632, std::ref(i8rng));
-  std::generate(v24, v24 + 12544, std::ref(i8rng));
-  std::generate(v25, v25 + 75264, std::ref(i8rng));
-  std::generate(v26, v26 + 75264, std::ref(i8rng));
-  std::generate(v27, v27 + 12544, std::ref(i8rng));
-  std::generate(v28, v28 + 12544, std::ref(i8rng));
-  std::generate(v29, v29 + 75264, std::ref(i8rng));
-  std::generate(v30, v30 + 75264, std::ref(i8rng));
-  std::generate(v31, v31 + 12544, std::ref(i8rng));
-  std::generate(v32, v32 + 12544, std::ref(i8rng));
-  std::generate(v33, v33 + 75264, std::ref(i8rng));
-  std::generate(v34, v34 + 75264, std::ref(i8rng));
-  std::generate(v35, v35 + 12544, std::ref(i8rng));
-  std::generate(v36, v36 + 12544, std::ref(i8rng));
-  std::generate(v37, v37 + 75264, std::ref(i8rng));
-  std::generate(v38, v38 + 75264, std::ref(i8rng));
-  std::generate(v39, v39 + 18816, std::ref(i8rng));
-  std::generate(v40, v40 + 112896, std::ref(i8rng));
-  std::generate(v41, v41 + 112896, std::ref(i8rng));
-  std::generate(v42, v42 + 18816, std::ref(i8rng));
-  std::generate(v43, v43 + 18816, std::ref(i8rng));
-  std::generate(v44, v44 + 112896, std::ref(i8rng));
-  std::generate(v45, v45 + 112896, std::ref(i8rng));
-  std::generate(v46, v46 + 18816, std::ref(i8rng));
-  std::generate(v47, v47 + 18816, std::ref(i8rng));
-  std::generate(v48, v48 + 112896, std::ref(i8rng));
-  std::generate(v49, v49 + 28224, std::ref(i8rng));
-  std::generate(v50, v50 + 7840, std::ref(i8rng));
-  std::generate(v51, v51 + 47040, std::ref(i8rng));
-  std::generate(v52, v52 + 47040, std::ref(i8rng));
-  std::generate(v53, v53 + 7840, std::ref(i8rng));
-  std::generate(v54, v54 + 7840, std::ref(i8rng));
-  std::generate(v55, v55 + 47040, std::ref(i8rng));
-  std::generate(v56, v56 + 47040, std::ref(i8rng));
-  std::generate(v57, v57 + 7840, std::ref(i8rng));
-  std::generate(v58, v58 + 7840, std::ref(i8rng));
-  std::generate(v59, v59 + 47040, std::ref(i8rng));
-  std::generate(v60, v60 + 47040, std::ref(i8rng));
-  std::generate(v61, v61 + 15680, std::ref(i8rng));
-  std::generate(v62, v62 + 62720, std::ref(i8rng));
-  std::generate(v63, v63 + 1280, std::ref(i8rng));
-  std::generate(v64, v64 + 1001, std::ref(i8rng));
-  std::generate(w65, w65 + 864, std::ref(i8rng));
-  std::generate(w66, w66 + 32, std::ref(i32rng));
-  std::generate(w67, w67 + 288, std::ref(i8rng));
-  std::generate(w68, w68 + 32, std::ref(i32rng));
-  std::generate(w69, w69 + 512, std::ref(i8rng));
-  std::generate(w70, w70 + 16, std::ref(i32rng));
-  std::generate(w71, w71 + 1536, std::ref(i8rng));
-  std::generate(w72, w72 + 96, std::ref(i32rng));
-  std::generate(w73, w73 + 864, std::ref(i8rng));
-  std::generate(w74, w74 + 96, std::ref(i32rng));
-  std::generate(w75, w75 + 2304, std::ref(i8rng));
-  std::generate(w76, w76 + 24, std::ref(i32rng));
-  std::generate(w77, w77 + 3456, std::ref(i8rng));
-  std::generate(w78, w78 + 144, std::ref(i32rng));
-  std::generate(w79, w79 + 1296, std::ref(i8rng));
-  std::generate(w80, w80 + 144, std::ref(i32rng));
-  std::generate(w81, w81 + 3456, std::ref(i8rng));
-  std::generate(w82, w82 + 24, std::ref(i32rng));
-  std::generate(w83, w83 + 3456, std::ref(i8rng));
-  std::generate(w84, w84 + 144, std::ref(i32rng));
-  std::generate(w85, w85 + 1296, std::ref(i8rng));
-  std::generate(w86, w86 + 144, std::ref(i32rng));
-  std::generate(w87, w87 + 4608, std::ref(i8rng));
-  std::generate(w88, w88 + 32, std::ref(i32rng));
-  std::generate(w89, w89 + 6144, std::ref(i8rng));
-  std::generate(w90, w90 + 192, std::ref(i32rng));
-  std::generate(w91, w91 + 1728, std::ref(i8rng));
-  std::generate(w92, w92 + 192, std::ref(i32rng));
-  std::generate(w93, w93 + 6144, std::ref(i8rng));
-  std::generate(w94, w94 + 32, std::ref(i32rng));
-  std::generate(w95, w95 + 6144, std::ref(i8rng));
-  std::generate(w96, w96 + 192, std::ref(i32rng));
-  std::generate(w97, w97 + 1728, std::ref(i8rng));
-  std::generate(w98, w98 + 192, std::ref(i32rng));
-  std::generate(w99, w99 + 6144, std::ref(i8rng));
-  std::generate(w100, w100 + 32, std::ref(i32rng));
-  std::generate(w101, w101 + 6144, std::ref(i8rng));
-  std::generate(w102, w102 + 192, std::ref(i32rng));
-  std::generate(w103, w103 + 1728, std::ref(i8rng));
-  std::generate(w104, w104 + 192, std::ref(i32rng));
-  std::generate(w105, w105 + 12288, std::ref(i8rng));
-  std::generate(w106, w106 + 64, std::ref(i32rng));
-  std::generate(w107, w107 + 24576, std::ref(i8rng));
-  std::generate(w108, w108 + 384, std::ref(i32rng));
-  std::generate(w109, w109 + 3456, std::ref(i8rng));
-  std::generate(w110, w110 + 384, std::ref(i32rng));
-  std::generate(w111, w111 + 24576, std::ref(i8rng));
-  std::generate(w112, w112 + 64, std::ref(i32rng));
-  std::generate(w113, w113 + 24576, std::ref(i8rng));
-  std::generate(w114, w114 + 384, std::ref(i32rng));
-  std::generate(w115, w115 + 3456, std::ref(i8rng));
-  std::generate(w116, w116 + 384, std::ref(i32rng));
-  std::generate(w117, w117 + 24576, std::ref(i8rng));
-  std::generate(w118, w118 + 64, std::ref(i32rng));
-  std::generate(w119, w119 + 24576, std::ref(i8rng));
-  std::generate(w120, w120 + 384, std::ref(i32rng));
-  std::generate(w121, w121 + 3456, std::ref(i8rng));
-  std::generate(w122, w122 + 384, std::ref(i32rng));
-  std::generate(w123, w123 + 24576, std::ref(i8rng));
-  std::generate(w124, w124 + 64, std::ref(i32rng));
-  std::generate(w125, w125 + 24576, std::ref(i8rng));
-  std::generate(w126, w126 + 384, std::ref(i32rng));
-  std::generate(w127, w127 + 3456, std::ref(i8rng));
-  std::generate(w128, w128 + 384, std::ref(i32rng));
-  std::generate(w129, w129 + 36864, std::ref(i8rng));
-  std::generate(w130, w130 + 96, std::ref(i32rng));
-  std::generate(w131, w131 + 55296, std::ref(i8rng));
-  std::generate(w132, w132 + 576, std::ref(i32rng));
-  std::generate(w133, w133 + 5184, std::ref(i8rng));
-  std::generate(w134, w134 + 576, std::ref(i32rng));
-  std::generate(w135, w135 + 55296, std::ref(i8rng));
-  std::generate(w136, w136 + 96, std::ref(i32rng));
-  std::generate(w137, w137 + 55296, std::ref(i8rng));
-  std::generate(w138, w138 + 576, std::ref(i32rng));
-  std::generate(w139, w139 + 5184, std::ref(i8rng));
-  std::generate(w140, w140 + 576, std::ref(i32rng));
-  std::generate(w141, w141 + 55296, std::ref(i8rng));
-  std::generate(w142, w142 + 96, std::ref(i32rng));
-  std::generate(w143, w143 + 55296, std::ref(i8rng));
-  std::generate(w144, w144 + 576, std::ref(i32rng));
-  std::generate(w145, w145 + 5184, std::ref(i8rng));
-  std::generate(w146, w146 + 576, std::ref(i32rng));
-  std::generate(w147, w147 + 92160, std::ref(i8rng));
-  std::generate(w148, w148 + 160, std::ref(i32rng));
-  std::generate(w149, w149 + 153600, std::ref(i8rng));
-  std::generate(w150, w150 + 960, std::ref(i32rng));
-  std::generate(w151, w151 + 8640, std::ref(i8rng));
-  std::generate(w152, w152 + 960, std::ref(i32rng));
-  std::generate(w153, w153 + 153600, std::ref(i8rng));
-  std::generate(w154, w154 + 160, std::ref(i32rng));
-  std::generate(w155, w155 + 153600, std::ref(i8rng));
-  std::generate(w156, w156 + 960, std::ref(i32rng));
-  std::generate(w157, w157 + 8640, std::ref(i8rng));
-  std::generate(w158, w158 + 960, std::ref(i32rng));
-  std::generate(w159, w159 + 153600, std::ref(i8rng));
-  std::generate(w160, w160 + 160, std::ref(i32rng));
-  std::generate(w161, w161 + 153600, std::ref(i8rng));
-  std::generate(w162, w162 + 960, std::ref(i32rng));
-  std::generate(w163, w163 + 8640, std::ref(i8rng));
-  std::generate(w164, w164 + 960, std::ref(i32rng));
-  std::generate(w165, w165 + 307200, std::ref(i8rng));
-  std::generate(w166, w166 + 320, std::ref(i32rng));
-  std::generate(w167, w167 + 409600, std::ref(i8rng));
-  std::generate(w168, w168 + 1280, std::ref(i32rng));
-  std::generate(w169, w169 + 1281280, std::ref(i8rng));
-  std::generate(w170, w170 + 1001, std::ref(i32rng));
+  std::generate(v0.begin(), v0.end(), std::ref(i8rng));
+  std::generate(v1.begin(), v1.end(), std::ref(i8rng));
+  std::generate(v2.begin(), v2.end(), std::ref(i8rng));
+  std::generate(v3.begin(), v3.end(), std::ref(i8rng));
+  std::generate(v4.begin(), v4.end(), std::ref(i8rng));
+  std::generate(v5.begin(), v5.end(), std::ref(i8rng));
+  std::generate(v6.begin(), v6.end(), std::ref(i8rng));
+  std::generate(v7.begin(), v7.end(), std::ref(i8rng));
+  std::generate(v8.begin(), v8.end(), std::ref(i8rng));
+  std::generate(v9.begin(), v9.end(), std::ref(i8rng));
+  std::generate(v10.begin(), v10.end(), std::ref(i8rng));
+  std::generate(v11.begin(), v11.end(), std::ref(i8rng));
+  std::generate(v12.begin(), v12.end(), std::ref(i8rng));
+  std::generate(v13.begin(), v13.end(), std::ref(i8rng));
+  std::generate(v14.begin(), v14.end(), std::ref(i8rng));
+  std::generate(v15.begin(), v15.end(), std::ref(i8rng));
+  std::generate(v16.begin(), v16.end(), std::ref(i8rng));
+  std::generate(v17.begin(), v17.end(), std::ref(i8rng));
+  std::generate(v18.begin(), v18.end(), std::ref(i8rng));
+  std::generate(v19.begin(), v19.end(), std::ref(i8rng));
+  std::generate(v20.begin(), v20.end(), std::ref(i8rng));
+  std::generate(v21.begin(), v21.end(), std::ref(i8rng));
+  std::generate(v22.begin(), v22.end(), std::ref(i8rng));
+  std::generate(v23.begin(), v23.end(), std::ref(i8rng));
+  std::generate(v24.begin(), v24.end(), std::ref(i8rng));
+  std::generate(v25.begin(), v25.end(), std::ref(i8rng));
+  std::generate(v26.begin(), v26.end(), std::ref(i8rng));
+  std::generate(v27.begin(), v27.end(), std::ref(i8rng));
+  std::generate(v28.begin(), v28.end(), std::ref(i8rng));
+  std::generate(v29.begin(), v29.end(), std::ref(i8rng));
+  std::generate(v30.begin(), v30.end(), std::ref(i8rng));
+  std::generate(v31.begin(), v31.end(), std::ref(i8rng));
+  std::generate(v32.begin(), v32.end(), std::ref(i8rng));
+  std::generate(v33.begin(), v33.end(), std::ref(i8rng));
+  std::generate(v34.begin(), v34.end(), std::ref(i8rng));
+  std::generate(v35.begin(), v35.end(), std::ref(i8rng));
+  std::generate(v36.begin(), v36.end(), std::ref(i8rng));
+  std::generate(v37.begin(), v37.end(), std::ref(i8rng));
+  std::generate(v38.begin(), v38.end(), std::ref(i8rng));
+  std::generate(v39.begin(), v39.end(), std::ref(i8rng));
+  std::generate(v40.begin(), v40.end(), std::ref(i8rng));
+  std::generate(v41.begin(), v41.end(), std::ref(i8rng));
+  std::generate(v42.begin(), v42.end(), std::ref(i8rng));
+  std::generate(v43.begin(), v43.end(), std::ref(i8rng));
+  std::generate(v44.begin(), v44.end(), std::ref(i8rng));
+  std::generate(v45.begin(), v45.end(), std::ref(i8rng));
+  std::generate(v46.begin(), v46.end(), std::ref(i8rng));
+  std::generate(v47.begin(), v47.end(), std::ref(i8rng));
+  std::generate(v48.begin(), v48.end(), std::ref(i8rng));
+  std::generate(v49.begin(), v49.end(), std::ref(i8rng));
+  std::generate(v50.begin(), v50.end(), std::ref(i8rng));
+  std::generate(v51.begin(), v51.end(), std::ref(i8rng));
+  std::generate(v52.begin(), v52.end(), std::ref(i8rng));
+  std::generate(v53.begin(), v53.end(), std::ref(i8rng));
+  std::generate(v54.begin(), v54.end(), std::ref(i8rng));
+  std::generate(v55.begin(), v55.end(), std::ref(i8rng));
+  std::generate(v56.begin(), v56.end(), std::ref(i8rng));
+  std::generate(v57.begin(), v57.end(), std::ref(i8rng));
+  std::generate(v58.begin(), v58.end(), std::ref(i8rng));
+  std::generate(v59.begin(), v59.end(), std::ref(i8rng));
+  std::generate(v60.begin(), v60.end(), std::ref(i8rng));
+  std::generate(v61.begin(), v61.end(), std::ref(i8rng));
+  std::generate(v62.begin(), v62.end(), std::ref(i8rng));
+  std::generate(v63.begin(), v63.end(), std::ref(i8rng));
+  std::generate(v64.begin(), v64.end(), std::ref(i8rng));
+  std::generate(w65.begin(), w65.end(), std::ref(i8rng));
+  std::generate(w66.begin(), w66.end(), std::ref(i32rng));
+  std::generate(w67.begin(), w67.end(), std::ref(i8rng));
+  std::generate(w68.begin(), w68.end(), std::ref(i32rng));
+  std::generate(w69.begin(), w69.end(), std::ref(i8rng));
+  std::generate(w70.begin(), w70.end(), std::ref(i32rng));
+  std::generate(w71.begin(), w71.end(), std::ref(i8rng));
+  std::generate(w72.begin(), w72.end(), std::ref(i32rng));
+  std::generate(w73.begin(), w73.end(), std::ref(i8rng));
+  std::generate(w74.begin(), w74.end(), std::ref(i32rng));
+  std::generate(w75.begin(), w75.end(), std::ref(i8rng));
+  std::generate(w76.begin(), w76.end(), std::ref(i32rng));
+  std::generate(w77.begin(), w77.end(), std::ref(i8rng));
+  std::generate(w78.begin(), w78.end(), std::ref(i32rng));
+  std::generate(w79.begin(), w79.end(), std::ref(i8rng));
+  std::generate(w80.begin(), w80.end(), std::ref(i32rng));
+  std::generate(w81.begin(), w81.end(), std::ref(i8rng));
+  std::generate(w82.begin(), w82.end(), std::ref(i32rng));
+  std::generate(w83.begin(), w83.end(), std::ref(i8rng));
+  std::generate(w84.begin(), w84.end(), std::ref(i32rng));
+  std::generate(w85.begin(), w85.end(), std::ref(i8rng));
+  std::generate(w86.begin(), w86.end(), std::ref(i32rng));
+  std::generate(w87.begin(), w87.end(), std::ref(i8rng));
+  std::generate(w88.begin(), w88.end(), std::ref(i32rng));
+  std::generate(w89.begin(), w89.end(), std::ref(i8rng));
+  std::generate(w90.begin(), w90.end(), std::ref(i32rng));
+  std::generate(w91.begin(), w91.end(), std::ref(i8rng));
+  std::generate(w92.begin(), w92.end(), std::ref(i32rng));
+  std::generate(w93.begin(), w93.end(), std::ref(i8rng));
+  std::generate(w94.begin(), w94.end(), std::ref(i32rng));
+  std::generate(w95.begin(), w95.end(), std::ref(i8rng));
+  std::generate(w96.begin(), w96.end(), std::ref(i32rng));
+  std::generate(w97.begin(), w97.end(), std::ref(i8rng));
+  std::generate(w98.begin(), w98.end(), std::ref(i32rng));
+  std::generate(w99.begin(), w99.end(), std::ref(i8rng));
+  std::generate(w100.begin(), w100.end(), std::ref(i32rng));
+  std::generate(w101.begin(), w101.end(), std::ref(i8rng));
+  std::generate(w102.begin(), w102.end(), std::ref(i32rng));
+  std::generate(w103.begin(), w103.end(), std::ref(i8rng));
+  std::generate(w104.begin(), w104.end(), std::ref(i32rng));
+  std::generate(w105.begin(), w105.end(), std::ref(i8rng));
+  std::generate(w106.begin(), w106.end(), std::ref(i32rng));
+  std::generate(w107.begin(), w107.end(), std::ref(i8rng));
+  std::generate(w108.begin(), w108.end(), std::ref(i32rng));
+  std::generate(w109.begin(), w109.end(), std::ref(i8rng));
+  std::generate(w110.begin(), w110.end(), std::ref(i32rng));
+  std::generate(w111.begin(), w111.end(), std::ref(i8rng));
+  std::generate(w112.begin(), w112.end(), std::ref(i32rng));
+  std::generate(w113.begin(), w113.end(), std::ref(i8rng));
+  std::generate(w114.begin(), w114.end(), std::ref(i32rng));
+  std::generate(w115.begin(), w115.end(), std::ref(i8rng));
+  std::generate(w116.begin(), w116.end(), std::ref(i32rng));
+  std::generate(w117.begin(), w117.end(), std::ref(i8rng));
+  std::generate(w118.begin(), w118.end(), std::ref(i32rng));
+  std::generate(w119.begin(), w119.end(), std::ref(i8rng));
+  std::generate(w120.begin(), w120.end(), std::ref(i32rng));
+  std::generate(w121.begin(), w121.end(), std::ref(i8rng));
+  std::generate(w122.begin(), w122.end(), std::ref(i32rng));
+  std::generate(w123.begin(), w123.end(), std::ref(i8rng));
+  std::generate(w124.begin(), w124.end(), std::ref(i32rng));
+  std::generate(w125.begin(), w125.end(), std::ref(i8rng));
+  std::generate(w126.begin(), w126.end(), std::ref(i32rng));
+  std::generate(w127.begin(), w127.end(), std::ref(i8rng));
+  std::generate(w128.begin(), w128.end(), std::ref(i32rng));
+  std::generate(w129.begin(), w129.end(), std::ref(i8rng));
+  std::generate(w130.begin(), w130.end(), std::ref(i32rng));
+  std::generate(w131.begin(), w131.end(), std::ref(i8rng));
+  std::generate(w132.begin(), w132.end(), std::ref(i32rng));
+  std::generate(w133.begin(), w133.end(), std::ref(i8rng));
+  std::generate(w134.begin(), w134.end(), std::ref(i32rng));
+  std::generate(w135.begin(), w135.end(), std::ref(i8rng));
+  std::generate(w136.begin(), w136.end(), std::ref(i32rng));
+  std::generate(w137.begin(), w137.end(), std::ref(i8rng));
+  std::generate(w138.begin(), w138.end(), std::ref(i32rng));
+  std::generate(w139.begin(), w139.end(), std::ref(i8rng));
+  std::generate(w140.begin(), w140.end(), std::ref(i32rng));
+  std::generate(w141.begin(), w141.end(), std::ref(i8rng));
+  std::generate(w142.begin(), w142.end(), std::ref(i32rng));
+  std::generate(w143.begin(), w143.end(), std::ref(i8rng));
+  std::generate(w144.begin(), w144.end(), std::ref(i32rng));
+  std::generate(w145.begin(), w145.end(), std::ref(i8rng));
+  std::generate(w146.begin(), w146.end(), std::ref(i32rng));
+  std::generate(w147.begin(), w147.end(), std::ref(i8rng));
+  std::generate(w148.begin(), w148.end(), std::ref(i32rng));
+  std::generate(w149.begin(), w149.end(), std::ref(i8rng));
+  std::generate(w150.begin(), w150.end(), std::ref(i32rng));
+  std::generate(w151.begin(), w151.end(), std::ref(i8rng));
+  std::generate(w152.begin(), w152.end(), std::ref(i32rng));
+  std::generate(w153.begin(), w153.end(), std::ref(i8rng));
+  std::generate(w154.begin(), w154.end(), std::ref(i32rng));
+  std::generate(w155.begin(), w155.end(), std::ref(i8rng));
+  std::generate(w156.begin(), w156.end(), std::ref(i32rng));
+  std::generate(w157.begin(), w157.end(), std::ref(i8rng));
+  std::generate(w158.begin(), w158.end(), std::ref(i32rng));
+  std::generate(w159.begin(), w159.end(), std::ref(i8rng));
+  std::generate(w160.begin(), w160.end(), std::ref(i32rng));
+  std::generate(w161.begin(), w161.end(), std::ref(i8rng));
+  std::generate(w162.begin(), w162.end(), std::ref(i32rng));
+  std::generate(w163.begin(), w163.end(), std::ref(i8rng));
+  std::generate(w164.begin(), w164.end(), std::ref(i32rng));
+  std::generate(w165.begin(), w165.end(), std::ref(i8rng));
+  std::generate(w166.begin(), w166.end(), std::ref(i32rng));
+  std::generate(w167.begin(), w167.end(), std::ref(i8rng));
+  std::generate(w168.begin(), w168.end(), std::ref(i32rng));
+  std::generate(w169.begin(), w169.end(), std::ref(i8rng));
+  std::generate(w170.begin(), w170.end(), std::ref(i32rng));
 
   ExecutionPlan operators;
   xnn_status status;
@@ -380,7 +381,7 @@
     3 /* input pixel stride */,
     32 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w65, w66,
+    w65.data(), w66.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op0);
@@ -403,7 +404,7 @@
     32 /* input pixel stride */,
     32 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w67, w68,
+    w67.data(), w68.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op1);
@@ -426,7 +427,7 @@
     32 /* input pixel stride */,
     16 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w69, w70,
+    w69.data(), w70.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op2);
@@ -449,7 +450,7 @@
     16 /* input pixel stride */,
     96 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w71, w72,
+    w71.data(), w72.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op3);
@@ -472,7 +473,7 @@
     96 /* input pixel stride */,
     96 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w73, w74,
+    w73.data(), w74.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op4);
@@ -495,7 +496,7 @@
     96 /* input pixel stride */,
     24 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w75, w76,
+    w75.data(), w76.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op5);
@@ -518,7 +519,7 @@
     24 /* input pixel stride */,
     144 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w77, w78,
+    w77.data(), w78.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op6);
@@ -541,7 +542,7 @@
     144 /* input pixel stride */,
     144 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w79, w80,
+    w79.data(), w80.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op7);
@@ -564,7 +565,7 @@
     144 /* input pixel stride */,
     24 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w81, w82,
+    w81.data(), w82.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op8);
@@ -600,7 +601,7 @@
     24 /* input pixel stride */,
     144 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w83, w84,
+    w83.data(), w84.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op10);
@@ -623,7 +624,7 @@
     144 /* input pixel stride */,
     144 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w85, w86,
+    w85.data(), w86.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op11);
@@ -646,7 +647,7 @@
     144 /* input pixel stride */,
     32 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w87, w88,
+    w87.data(), w88.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op12);
@@ -669,7 +670,7 @@
     32 /* input pixel stride */,
     192 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w89, w90,
+    w89.data(), w90.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op13);
@@ -692,7 +693,7 @@
     192 /* input pixel stride */,
     192 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w91, w92,
+    w91.data(), w92.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op14);
@@ -715,7 +716,7 @@
     192 /* input pixel stride */,
     32 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w93, w94,
+    w93.data(), w94.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op15);
@@ -751,7 +752,7 @@
     32 /* input pixel stride */,
     192 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w95, w96,
+    w95.data(), w96.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op17);
@@ -774,7 +775,7 @@
     192 /* input pixel stride */,
     192 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w97, w98,
+    w97.data(), w98.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op18);
@@ -797,7 +798,7 @@
     192 /* input pixel stride */,
     32 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w99, w100,
+    w99.data(), w100.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op19);
@@ -833,7 +834,7 @@
     32 /* input pixel stride */,
     192 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w101, w102,
+    w101.data(), w102.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op21);
@@ -856,7 +857,7 @@
     192 /* input pixel stride */,
     192 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w103, w104,
+    w103.data(), w104.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op22);
@@ -879,7 +880,7 @@
     192 /* input pixel stride */,
     64 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w105, w106,
+    w105.data(), w106.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op23);
@@ -902,7 +903,7 @@
     64 /* input pixel stride */,
     384 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w107, w108,
+    w107.data(), w108.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op24);
@@ -925,7 +926,7 @@
     384 /* input pixel stride */,
     384 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w109, w110,
+    w109.data(), w110.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op25);
@@ -948,7 +949,7 @@
     384 /* input pixel stride */,
     64 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w111, w112,
+    w111.data(), w112.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op26);
@@ -984,7 +985,7 @@
     64 /* input pixel stride */,
     384 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w113, w114,
+    w113.data(), w114.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op28);
@@ -1007,7 +1008,7 @@
     384 /* input pixel stride */,
     384 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w115, w116,
+    w115.data(), w116.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op29);
@@ -1030,7 +1031,7 @@
     384 /* input pixel stride */,
     64 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w117, w118,
+    w117.data(), w118.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op30);
@@ -1066,7 +1067,7 @@
     64 /* input pixel stride */,
     384 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w119, w120,
+    w119.data(), w120.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op32);
@@ -1089,7 +1090,7 @@
     384 /* input pixel stride */,
     384 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w121, w122,
+    w121.data(), w122.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op33);
@@ -1112,7 +1113,7 @@
     384 /* input pixel stride */,
     64 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w123, w124,
+    w123.data(), w124.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op34);
@@ -1148,7 +1149,7 @@
     64 /* input pixel stride */,
     384 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w125, w126,
+    w125.data(), w126.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op36);
@@ -1171,7 +1172,7 @@
     384 /* input pixel stride */,
     384 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w127, w128,
+    w127.data(), w128.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op37);
@@ -1194,7 +1195,7 @@
     384 /* input pixel stride */,
     96 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w129, w130,
+    w129.data(), w130.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op38);
@@ -1217,7 +1218,7 @@
     96 /* input pixel stride */,
     576 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w131, w132,
+    w131.data(), w132.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op39);
@@ -1240,7 +1241,7 @@
     576 /* input pixel stride */,
     576 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w133, w134,
+    w133.data(), w134.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op40);
@@ -1263,7 +1264,7 @@
     576 /* input pixel stride */,
     96 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w135, w136,
+    w135.data(), w136.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op41);
@@ -1299,7 +1300,7 @@
     96 /* input pixel stride */,
     576 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w137, w138,
+    w137.data(), w138.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op43);
@@ -1322,7 +1323,7 @@
     576 /* input pixel stride */,
     576 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w139, w140,
+    w139.data(), w140.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op44);
@@ -1345,7 +1346,7 @@
     576 /* input pixel stride */,
     96 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w141, w142,
+    w141.data(), w142.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op45);
@@ -1381,7 +1382,7 @@
     96 /* input pixel stride */,
     576 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w143, w144,
+    w143.data(), w144.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op47);
@@ -1404,7 +1405,7 @@
     576 /* input pixel stride */,
     576 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w145, w146,
+    w145.data(), w146.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op48);
@@ -1427,7 +1428,7 @@
     576 /* input pixel stride */,
     160 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w147, w148,
+    w147.data(), w148.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op49);
@@ -1450,7 +1451,7 @@
     160 /* input pixel stride */,
     960 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w149, w150,
+    w149.data(), w150.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op50);
@@ -1473,7 +1474,7 @@
     960 /* input pixel stride */,
     960 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w151, w152,
+    w151.data(), w152.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op51);
@@ -1496,7 +1497,7 @@
     960 /* input pixel stride */,
     160 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w153, w154,
+    w153.data(), w154.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op52);
@@ -1532,7 +1533,7 @@
     160 /* input pixel stride */,
     960 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w155, w156,
+    w155.data(), w156.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op54);
@@ -1555,7 +1556,7 @@
     960 /* input pixel stride */,
     960 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w157, w158,
+    w157.data(), w158.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op55);
@@ -1578,7 +1579,7 @@
     960 /* input pixel stride */,
     160 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w159, w160,
+    w159.data(), w160.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op56);
@@ -1614,7 +1615,7 @@
     160 /* input pixel stride */,
     960 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w161, w162,
+    w161.data(), w162.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op58);
@@ -1637,7 +1638,7 @@
     960 /* input pixel stride */,
     960 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w163, w164,
+    w163.data(), w164.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op59);
@@ -1660,7 +1661,7 @@
     960 /* input pixel stride */,
     320 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w165, w166,
+    w165.data(), w166.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op60);
@@ -1683,7 +1684,7 @@
     320 /* input pixel stride */,
     1280 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w167, w168,
+    w167.data(), w168.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op61);
@@ -1719,7 +1720,7 @@
     1280 /* input pixel stride */,
     1001 /* output pixel stride */,
     -1 /* input zero point */, 0.5f /* input scale */, 0.5f /* kernel scale */,
-    w169, w170,
+    w169.data(), w170.data(),
     -1 /* output zero point */, 0.5f /* output scale */, -126 /* output min */, 126 /* output max */,
     0 /* flags */,
     &op63);
@@ -1734,7 +1735,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op0,
     1 /* batch size */, 224 /* input height */, 224 /* input width */,
-    v0 /* input */, v1 /* output */,
+    v0.data() /* input */, v1.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #0" << std::endl;
@@ -1744,7 +1745,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op1,
     1 /* batch size */, 112 /* input height */, 112 /* input width */,
-    v1 /* input */, v2 /* output */,
+    v1.data() /* input */, v2.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #1" << std::endl;
@@ -1754,7 +1755,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op2,
     1 /* batch size */, 112 /* input height */, 112 /* input width */,
-    v2 /* input */, v3 /* output */,
+    v2.data() /* input */, v3.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #2" << std::endl;
@@ -1764,7 +1765,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op3,
     1 /* batch size */, 112 /* input height */, 112 /* input width */,
-    v3 /* input */, v4 /* output */,
+    v3.data() /* input */, v4.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #3" << std::endl;
@@ -1774,7 +1775,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op4,
     1 /* batch size */, 112 /* input height */, 112 /* input width */,
-    v4 /* input */, v5 /* output */,
+    v4.data() /* input */, v5.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #4" << std::endl;
@@ -1784,7 +1785,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op5,
     1 /* batch size */, 56 /* input height */, 56 /* input width */,
-    v5 /* input */, v6 /* output */,
+    v5.data() /* input */, v6.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #5" << std::endl;
@@ -1794,7 +1795,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op6,
     1 /* batch size */, 56 /* input height */, 56 /* input width */,
-    v6 /* input */, v7 /* output */,
+    v6.data() /* input */, v7.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #6" << std::endl;
@@ -1804,7 +1805,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op7,
     1 /* batch size */, 56 /* input height */, 56 /* input width */,
-    v7 /* input */, v8 /* output */,
+    v7.data() /* input */, v8.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #7" << std::endl;
@@ -1814,7 +1815,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op8,
     1 /* batch size */, 56 /* input height */, 56 /* input width */,
-    v8 /* input */, v9 /* output */,
+    v8.data() /* input */, v9.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #8" << std::endl;
@@ -1827,7 +1828,7 @@
     status = xnn_setup_add_nd_qs8(
       op9,
       4, a_shape, 4, b_shape,
-      v9 /* a */, v6 /* b */, v10 /* output */,
+      v9.data() /* a */, v6.data() /* b */, v10.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -1838,7 +1839,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op10,
     1 /* batch size */, 56 /* input height */, 56 /* input width */,
-    v10 /* input */, v11 /* output */,
+    v10.data() /* input */, v11.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #10" << std::endl;
@@ -1848,7 +1849,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op11,
     1 /* batch size */, 56 /* input height */, 56 /* input width */,
-    v11 /* input */, v12 /* output */,
+    v11.data() /* input */, v12.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #11" << std::endl;
@@ -1858,7 +1859,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op12,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v12 /* input */, v13 /* output */,
+    v12.data() /* input */, v13.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #12" << std::endl;
@@ -1868,7 +1869,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op13,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v13 /* input */, v14 /* output */,
+    v13.data() /* input */, v14.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #13" << std::endl;
@@ -1878,7 +1879,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op14,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v14 /* input */, v15 /* output */,
+    v14.data() /* input */, v15.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #14" << std::endl;
@@ -1888,7 +1889,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op15,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v15 /* input */, v16 /* output */,
+    v15.data() /* input */, v16.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #15" << std::endl;
@@ -1901,7 +1902,7 @@
     status = xnn_setup_add_nd_qs8(
       op16,
       4, a_shape, 4, b_shape,
-      v16 /* a */, v13 /* b */, v17 /* output */,
+      v16.data() /* a */, v13.data() /* b */, v17.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -1912,7 +1913,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op17,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v17 /* input */, v18 /* output */,
+    v17.data() /* input */, v18.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #17" << std::endl;
@@ -1922,7 +1923,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op18,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v18 /* input */, v19 /* output */,
+    v18.data() /* input */, v19.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #18" << std::endl;
@@ -1932,7 +1933,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op19,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v19 /* input */, v20 /* output */,
+    v19.data() /* input */, v20.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #19" << std::endl;
@@ -1945,7 +1946,7 @@
     status = xnn_setup_add_nd_qs8(
       op20,
       4, a_shape, 4, b_shape,
-      v20 /* a */, v17 /* b */, v21 /* output */,
+      v20.data() /* a */, v17.data() /* b */, v21.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -1956,7 +1957,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op21,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v21 /* input */, v22 /* output */,
+    v21.data() /* input */, v22.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #21" << std::endl;
@@ -1966,7 +1967,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op22,
     1 /* batch size */, 28 /* input height */, 28 /* input width */,
-    v22 /* input */, v23 /* output */,
+    v22.data() /* input */, v23.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #22" << std::endl;
@@ -1976,7 +1977,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op23,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v23 /* input */, v24 /* output */,
+    v23.data() /* input */, v24.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #23" << std::endl;
@@ -1986,7 +1987,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op24,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v24 /* input */, v25 /* output */,
+    v24.data() /* input */, v25.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #24" << std::endl;
@@ -1996,7 +1997,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op25,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v25 /* input */, v26 /* output */,
+    v25.data() /* input */, v26.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #25" << std::endl;
@@ -2006,7 +2007,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op26,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v26 /* input */, v27 /* output */,
+    v26.data() /* input */, v27.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #26" << std::endl;
@@ -2019,7 +2020,7 @@
     status = xnn_setup_add_nd_qs8(
       op27,
       4, a_shape, 4, b_shape,
-      v27 /* a */, v24 /* b */, v28 /* output */,
+      v27.data() /* a */, v24.data() /* b */, v28.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -2030,7 +2031,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op28,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v28 /* input */, v29 /* output */,
+    v28.data() /* input */, v29.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #28" << std::endl;
@@ -2040,7 +2041,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op29,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v29 /* input */, v30 /* output */,
+    v29.data() /* input */, v30.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #29" << std::endl;
@@ -2050,7 +2051,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op30,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v30 /* input */, v31 /* output */,
+    v30.data() /* input */, v31.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #30" << std::endl;
@@ -2063,7 +2064,7 @@
     status = xnn_setup_add_nd_qs8(
       op31,
       4, a_shape, 4, b_shape,
-      v31 /* a */, v28 /* b */, v32 /* output */,
+      v31.data() /* a */, v28.data() /* b */, v32.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -2074,7 +2075,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op32,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v32 /* input */, v33 /* output */,
+    v32.data() /* input */, v33.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #32" << std::endl;
@@ -2084,7 +2085,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op33,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v33 /* input */, v34 /* output */,
+    v33.data() /* input */, v34.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #33" << std::endl;
@@ -2094,7 +2095,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op34,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v34 /* input */, v35 /* output */,
+    v34.data() /* input */, v35.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #34" << std::endl;
@@ -2107,7 +2108,7 @@
     status = xnn_setup_add_nd_qs8(
       op35,
       4, a_shape, 4, b_shape,
-      v35 /* a */, v32 /* b */, v36 /* output */,
+      v35.data() /* a */, v32.data() /* b */, v36.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -2118,7 +2119,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op36,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v36 /* input */, v37 /* output */,
+    v36.data() /* input */, v37.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #36" << std::endl;
@@ -2128,7 +2129,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op37,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v37 /* input */, v38 /* output */,
+    v37.data() /* input */, v38.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #37" << std::endl;
@@ -2138,7 +2139,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op38,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v38 /* input */, v39 /* output */,
+    v38.data() /* input */, v39.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #38" << std::endl;
@@ -2148,7 +2149,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op39,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v39 /* input */, v40 /* output */,
+    v39.data() /* input */, v40.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #39" << std::endl;
@@ -2158,7 +2159,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op40,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v40 /* input */, v41 /* output */,
+    v40.data() /* input */, v41.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #40" << std::endl;
@@ -2168,7 +2169,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op41,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v41 /* input */, v42 /* output */,
+    v41.data() /* input */, v42.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #41" << std::endl;
@@ -2181,7 +2182,7 @@
     status = xnn_setup_add_nd_qs8(
       op42,
       4, a_shape, 4, b_shape,
-      v42 /* a */, v39 /* b */, v43 /* output */,
+      v42.data() /* a */, v39.data() /* b */, v43.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -2192,7 +2193,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op43,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v43 /* input */, v44 /* output */,
+    v43.data() /* input */, v44.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #43" << std::endl;
@@ -2202,7 +2203,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op44,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v44 /* input */, v45 /* output */,
+    v44.data() /* input */, v45.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #44" << std::endl;
@@ -2212,7 +2213,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op45,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v45 /* input */, v46 /* output */,
+    v45.data() /* input */, v46.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #45" << std::endl;
@@ -2225,7 +2226,7 @@
     status = xnn_setup_add_nd_qs8(
       op46,
       4, a_shape, 4, b_shape,
-      v46 /* a */, v43 /* b */, v47 /* output */,
+      v46.data() /* a */, v43.data() /* b */, v47.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -2236,7 +2237,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op47,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v47 /* input */, v48 /* output */,
+    v47.data() /* input */, v48.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #47" << std::endl;
@@ -2246,7 +2247,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op48,
     1 /* batch size */, 14 /* input height */, 14 /* input width */,
-    v48 /* input */, v49 /* output */,
+    v48.data() /* input */, v49.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #48" << std::endl;
@@ -2256,7 +2257,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op49,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v49 /* input */, v50 /* output */,
+    v49.data() /* input */, v50.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #49" << std::endl;
@@ -2266,7 +2267,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op50,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v50 /* input */, v51 /* output */,
+    v50.data() /* input */, v51.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #50" << std::endl;
@@ -2276,7 +2277,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op51,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v51 /* input */, v52 /* output */,
+    v51.data() /* input */, v52.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #51" << std::endl;
@@ -2286,7 +2287,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op52,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v52 /* input */, v53 /* output */,
+    v52.data() /* input */, v53.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #52" << std::endl;
@@ -2299,7 +2300,7 @@
     status = xnn_setup_add_nd_qs8(
       op53,
       4, a_shape, 4, b_shape,
-      v53 /* a */, v50 /* b */, v54 /* output */,
+      v53.data() /* a */, v50.data() /* b */, v54.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -2310,7 +2311,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op54,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v54 /* input */, v55 /* output */,
+    v54.data() /* input */, v55.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #54" << std::endl;
@@ -2320,7 +2321,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op55,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v55 /* input */, v56 /* output */,
+    v55.data() /* input */, v56.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #55" << std::endl;
@@ -2330,7 +2331,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op56,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v56 /* input */, v57 /* output */,
+    v56.data() /* input */, v57.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #56" << std::endl;
@@ -2343,7 +2344,7 @@
     status = xnn_setup_add_nd_qs8(
       op57,
       4, a_shape, 4, b_shape,
-      v57 /* a */, v54 /* b */, v58 /* output */,
+      v57.data() /* a */, v54.data() /* b */, v58.data() /* output */,
       threadpool /* threadpool */);
   }
   if (status != xnn_status_success) {
@@ -2354,7 +2355,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op58,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v58 /* input */, v59 /* output */,
+    v58.data() /* input */, v59.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #58" << std::endl;
@@ -2364,7 +2365,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op59,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v59 /* input */, v60 /* output */,
+    v59.data() /* input */, v60.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #59" << std::endl;
@@ -2374,7 +2375,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op60,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v60 /* input */, v61 /* output */,
+    v60.data() /* input */, v61.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #60" << std::endl;
@@ -2384,7 +2385,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op61,
     1 /* batch size */, 7 /* input height */, 7 /* input width */,
-    v61 /* input */, v62 /* output */,
+    v61.data() /* input */, v62.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #61" << std::endl;
@@ -2394,7 +2395,7 @@
   status = xnn_setup_global_average_pooling_nwc_qs8(
     op62,
     1 /* batch size */, 49 /* width */,
-    v62 /* input */, v63 /* output */,
+    v62.data() /* input */, v63.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #62" << std::endl;
@@ -2404,7 +2405,7 @@
   status = xnn_setup_convolution2d_nhwc_qs8(
     op63,
     1 /* batch size */, 1 /* input height */, 1 /* input width */,
-    v63 /* input */, v64 /* output */,
+    v63.data() /* input */, v64.data() /* output */,
     threadpool /* threadpool */);
   if (status != xnn_status_success) {
     std::cerr << "failed to setup operation #63" << std::endl;
diff --git a/models/qu8-mobilenet-v1.cc b/models/qu8-mobilenet-v1.cc
new file mode 100644
index 0000000..ce3c61b
--- /dev/null
+++ b/models/qu8-mobilenet-v1.cc
@@ -0,0 +1,1157 @@
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <xnnpack.h>
+
+#include <array>
+#include <algorithm>
+#include <functional>
+#include <iostream>
+#include <limits>
+#include <random>
+
+#include "models/models.h"
+
+namespace models {
+
+ExecutionPlan QU8MobileNetV1(pthreadpool_t threadpool) {
+  alignas(16) static std::array<uint8_t, 150528> v0;
+  alignas(16) static std::array<uint8_t, 401408> v1;
+  alignas(16) static std::array<uint8_t, 401408> v2;
+  alignas(16) static std::array<uint8_t, 802816> v3;
+  alignas(16) static std::array<uint8_t, 200704> v4;
+  alignas(16) static std::array<uint8_t, 401408> v5;
+  alignas(16) static std::array<uint8_t, 401408> v6;
+  alignas(16) static std::array<uint8_t, 401408> v7;
+  alignas(16) static std::array<uint8_t, 100352> v8;
+  alignas(16) static std::array<uint8_t, 200704> v9;
+  alignas(16) static std::array<uint8_t, 200704> v10;
+  alignas(16) static std::array<uint8_t, 200704> v11;
+  alignas(16) static std::array<uint8_t, 50176> v12;
+  alignas(16) static std::array<uint8_t, 100352> v13;
+  alignas(16) static std::array<uint8_t, 100352> v14;
+  alignas(16) static std::array<uint8_t, 100352> v15;
+  alignas(16) static std::array<uint8_t, 100352> v16;
+  alignas(16) static std::array<uint8_t, 100352> v17;
+  alignas(16) static std::array<uint8_t, 100352> v18;
+  alignas(16) static std::array<uint8_t, 100352> v19;
+  alignas(16) static std::array<uint8_t, 100352> v20;
+  alignas(16) static std::array<uint8_t, 100352> v21;
+  alignas(16) static std::array<uint8_t, 100352> v22;
+  alignas(16) static std::array<uint8_t, 100352> v23;
+  alignas(16) static std::array<uint8_t, 25088> v24;
+  alignas(16) static std::array<uint8_t, 50176> v25;
+  alignas(16) static std::array<uint8_t, 50176> v26;
+  alignas(16) static std::array<uint8_t, 50176> v27;
+  alignas(16) static std::array<uint8_t, 1024> v28;
+  alignas(16) static std::array<uint8_t, 1001> v29;
+  alignas(16) static std::array<uint8_t, 864> w30;
+  alignas(16) static std::array<int32_t, 32> w31;
+  alignas(16) static std::array<uint8_t, 288> w32;
+  alignas(16) static std::array<int32_t, 32> w33;
+  alignas(16) static std::array<uint8_t, 2048> w34;
+  alignas(16) static std::array<int32_t, 64> w35;
+  alignas(16) static std::array<uint8_t, 576> w36;
+  alignas(16) static std::array<int32_t, 64> w37;
+  alignas(16) static std::array<uint8_t, 8192> w38;
+  alignas(16) static std::array<int32_t, 128> w39;
+  alignas(16) static std::array<uint8_t, 1152> w40;
+  alignas(16) static std::array<int32_t, 128> w41;
+  alignas(16) static std::array<uint8_t, 16384> w42;
+  alignas(16) static std::array<int32_t, 128> w43;
+  alignas(16) static std::array<uint8_t, 1152> w44;
+  alignas(16) static std::array<int32_t, 128> w45;
+  alignas(16) static std::array<uint8_t, 32768> w46;
+  alignas(16) static std::array<int32_t, 256> w47;
+  alignas(16) static std::array<uint8_t, 2304> w48;
+  alignas(16) static std::array<int32_t, 256> w49;
+  alignas(16) static std::array<uint8_t, 65536> w50;
+  alignas(16) static std::array<int32_t, 256> w51;
+  alignas(16) static std::array<uint8_t, 2304> w52;
+  alignas(16) static std::array<int32_t, 256> w53;
+  alignas(16) static std::array<uint8_t, 131072> w54;
+  alignas(16) static std::array<int32_t, 512> w55;
+  alignas(16) static std::array<uint8_t, 4608> w56;
+  alignas(16) static std::array<int32_t, 512> w57;
+  alignas(16) static std::array<uint8_t, 262144> w58;
+  alignas(16) static std::array<int32_t, 512> w59;
+  alignas(16) static std::array<uint8_t, 4608> w60;
+  alignas(16) static std::array<int32_t, 512> w61;
+  alignas(16) static std::array<uint8_t, 262144> w62;
+  alignas(16) static std::array<int32_t, 512> w63;
+  alignas(16) static std::array<uint8_t, 4608> w64;
+  alignas(16) static std::array<int32_t, 512> w65;
+  alignas(16) static std::array<uint8_t, 262144> w66;
+  alignas(16) static std::array<int32_t, 512> w67;
+  alignas(16) static std::array<uint8_t, 4608> w68;
+  alignas(16) static std::array<int32_t, 512> w69;
+  alignas(16) static std::array<uint8_t, 262144> w70;
+  alignas(16) static std::array<int32_t, 512> w71;
+  alignas(16) static std::array<uint8_t, 4608> w72;
+  alignas(16) static std::array<int32_t, 512> w73;
+  alignas(16) static std::array<uint8_t, 262144> w74;
+  alignas(16) static std::array<int32_t, 512> w75;
+  alignas(16) static std::array<uint8_t, 4608> w76;
+  alignas(16) static std::array<int32_t, 512> w77;
+  alignas(16) static std::array<uint8_t, 524288> w78;
+  alignas(16) static std::array<int32_t, 1024> w79;
+  alignas(16) static std::array<uint8_t, 9216> w80;
+  alignas(16) static std::array<int32_t, 1024> w81;
+  alignas(16) static std::array<uint8_t, 1048576> w82;
+  alignas(16) static std::array<int32_t, 1024> w83;
+  alignas(16) static std::array<uint8_t, 1025024> w84;
+  alignas(16) static std::array<int32_t, 1001> w85;
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, 255), std::ref(rng));
+  auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
+  std::generate(v0.begin(), v0.end(), std::ref(u8rng));
+  std::generate(v1.begin(), v1.end(), std::ref(u8rng));
+  std::generate(v2.begin(), v2.end(), std::ref(u8rng));
+  std::generate(v3.begin(), v3.end(), std::ref(u8rng));
+  std::generate(v4.begin(), v4.end(), std::ref(u8rng));
+  std::generate(v5.begin(), v5.end(), std::ref(u8rng));
+  std::generate(v6.begin(), v6.end(), std::ref(u8rng));
+  std::generate(v7.begin(), v7.end(), std::ref(u8rng));
+  std::generate(v8.begin(), v8.end(), std::ref(u8rng));
+  std::generate(v9.begin(), v9.end(), std::ref(u8rng));
+  std::generate(v10.begin(), v10.end(), std::ref(u8rng));
+  std::generate(v11.begin(), v11.end(), std::ref(u8rng));
+  std::generate(v12.begin(), v12.end(), std::ref(u8rng));
+  std::generate(v13.begin(), v13.end(), std::ref(u8rng));
+  std::generate(v14.begin(), v14.end(), std::ref(u8rng));
+  std::generate(v15.begin(), v15.end(), std::ref(u8rng));
+  std::generate(v16.begin(), v16.end(), std::ref(u8rng));
+  std::generate(v17.begin(), v17.end(), std::ref(u8rng));
+  std::generate(v18.begin(), v18.end(), std::ref(u8rng));
+  std::generate(v19.begin(), v19.end(), std::ref(u8rng));
+  std::generate(v20.begin(), v20.end(), std::ref(u8rng));
+  std::generate(v21.begin(), v21.end(), std::ref(u8rng));
+  std::generate(v22.begin(), v22.end(), std::ref(u8rng));
+  std::generate(v23.begin(), v23.end(), std::ref(u8rng));
+  std::generate(v24.begin(), v24.end(), std::ref(u8rng));
+  std::generate(v25.begin(), v25.end(), std::ref(u8rng));
+  std::generate(v26.begin(), v26.end(), std::ref(u8rng));
+  std::generate(v27.begin(), v27.end(), std::ref(u8rng));
+  std::generate(v28.begin(), v28.end(), std::ref(u8rng));
+  std::generate(v29.begin(), v29.end(), std::ref(u8rng));
+  std::generate(w30.begin(), w30.end(), std::ref(u8rng));
+  std::generate(w31.begin(), w31.end(), std::ref(i32rng));
+  std::generate(w32.begin(), w32.end(), std::ref(u8rng));
+  std::generate(w33.begin(), w33.end(), std::ref(i32rng));
+  std::generate(w34.begin(), w34.end(), std::ref(u8rng));
+  std::generate(w35.begin(), w35.end(), std::ref(i32rng));
+  std::generate(w36.begin(), w36.end(), std::ref(u8rng));
+  std::generate(w37.begin(), w37.end(), std::ref(i32rng));
+  std::generate(w38.begin(), w38.end(), std::ref(u8rng));
+  std::generate(w39.begin(), w39.end(), std::ref(i32rng));
+  std::generate(w40.begin(), w40.end(), std::ref(u8rng));
+  std::generate(w41.begin(), w41.end(), std::ref(i32rng));
+  std::generate(w42.begin(), w42.end(), std::ref(u8rng));
+  std::generate(w43.begin(), w43.end(), std::ref(i32rng));
+  std::generate(w44.begin(), w44.end(), std::ref(u8rng));
+  std::generate(w45.begin(), w45.end(), std::ref(i32rng));
+  std::generate(w46.begin(), w46.end(), std::ref(u8rng));
+  std::generate(w47.begin(), w47.end(), std::ref(i32rng));
+  std::generate(w48.begin(), w48.end(), std::ref(u8rng));
+  std::generate(w49.begin(), w49.end(), std::ref(i32rng));
+  std::generate(w50.begin(), w50.end(), std::ref(u8rng));
+  std::generate(w51.begin(), w51.end(), std::ref(i32rng));
+  std::generate(w52.begin(), w52.end(), std::ref(u8rng));
+  std::generate(w53.begin(), w53.end(), std::ref(i32rng));
+  std::generate(w54.begin(), w54.end(), std::ref(u8rng));
+  std::generate(w55.begin(), w55.end(), std::ref(i32rng));
+  std::generate(w56.begin(), w56.end(), std::ref(u8rng));
+  std::generate(w57.begin(), w57.end(), std::ref(i32rng));
+  std::generate(w58.begin(), w58.end(), std::ref(u8rng));
+  std::generate(w59.begin(), w59.end(), std::ref(i32rng));
+  std::generate(w60.begin(), w60.end(), std::ref(u8rng));
+  std::generate(w61.begin(), w61.end(), std::ref(i32rng));
+  std::generate(w62.begin(), w62.end(), std::ref(u8rng));
+  std::generate(w63.begin(), w63.end(), std::ref(i32rng));
+  std::generate(w64.begin(), w64.end(), std::ref(u8rng));
+  std::generate(w65.begin(), w65.end(), std::ref(i32rng));
+  std::generate(w66.begin(), w66.end(), std::ref(u8rng));
+  std::generate(w67.begin(), w67.end(), std::ref(i32rng));
+  std::generate(w68.begin(), w68.end(), std::ref(u8rng));
+  std::generate(w69.begin(), w69.end(), std::ref(i32rng));
+  std::generate(w70.begin(), w70.end(), std::ref(u8rng));
+  std::generate(w71.begin(), w71.end(), std::ref(i32rng));
+  std::generate(w72.begin(), w72.end(), std::ref(u8rng));
+  std::generate(w73.begin(), w73.end(), std::ref(i32rng));
+  std::generate(w74.begin(), w74.end(), std::ref(u8rng));
+  std::generate(w75.begin(), w75.end(), std::ref(i32rng));
+  std::generate(w76.begin(), w76.end(), std::ref(u8rng));
+  std::generate(w77.begin(), w77.end(), std::ref(i32rng));
+  std::generate(w78.begin(), w78.end(), std::ref(u8rng));
+  std::generate(w79.begin(), w79.end(), std::ref(i32rng));
+  std::generate(w80.begin(), w80.end(), std::ref(u8rng));
+  std::generate(w81.begin(), w81.end(), std::ref(i32rng));
+  std::generate(w82.begin(), w82.end(), std::ref(u8rng));
+  std::generate(w83.begin(), w83.end(), std::ref(i32rng));
+  std::generate(w84.begin(), w84.end(), std::ref(u8rng));
+  std::generate(w85.begin(), w85.end(), std::ref(i32rng));
+
+  ExecutionPlan operators;
+  xnn_status status;
+
+  xnn_operator_t op0 = nullptr;
+  status = xnn_create_convolution2d_nhwc_qu8(
+    0 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 0 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    2 /* subsampling height */, 2 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    3 /* input channels per group */,
+    32 /* output_channels_per_group */,
+    3 /* input pixel stride */,
+    32 /* output pixel stride */,
+    127 /* input zero point */, 0.5f /* input scale */, 128 /* kernel zero point */, 0.5f /* kernel scale */,
+    w30.data(), w31.data(),
+    127 /* output zero point */, 0.5f /* output scale */, 0 /* output min */, 255 /* output max */,
+    0 /* flags */,
+    &op0);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #0" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op0, xnn_delete_operator);
+
+  xnn_operator_t op1 = nullptr;
+  status = xnn_create_convolution2d_nhwc_qu8(
+    1 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 1 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    32 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    32 /* input pixel stride */,
+    32 /* output pixel stride */,
+    127 /* input zero point */, 0.5f /* input scale */, 128 /* kernel zero point */, 0.5f /* kernel scale */,
+    w32.data(), w33.data(),
+    127 /* output zero point */, 0.5f /* output scale */, 0 /* output min */, 255 /* output max */,
+    0 /* flags */,
+    &op1);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #1" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op1, xnn_delete_operator);
+
+  xnn_operator_t op2 = nullptr;
+  status = xnn_create_convolution2d_nhwc_qu8(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    32 /* input channels per group */,
+    64 /* output_channels_per_group */,
+    32 /* input pixel stride */,
+    64 /* output pixel stride */,
+    127 /* input zero point */, 0.5f /* input scale */, 128 /* kernel zero point */, 0.5f /* kernel scale */,
+    w34.data(), w35.data(),
+    127 /* output zero point */, 0.5f /* output scale */, 0 /* output min */, 255 /* output max */,
+    0 /* flags */,
+    &op2);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #2" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op2, xnn_delete_operator);
+
+  xnn_operator_t op3 = nullptr;
+  status = xnn_create_convolution2d_nhwc_qu8(
+    0 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 0 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    2 /* subsampling height */, 2 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    64 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    64 /* input pixel stride */,
+    64 /* output pixel stride */,
+    127 /* input zero point */, 0.5f /* input scale */, 128 /* kernel zero point */, 0.5f /* kernel scale */,
+    w36.data(), w37.data(),
+    127 /* output zero point */, 0.5f /* output scale */, 0 /* output min */, 255 /* output max */,
+    0 /* flags */,
+    &op3);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #3" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op3, xnn_delete_operator);
+
+  xnn_operator_t op4 = nullptr;
+  status = xnn_create_convolution2d_nhwc_qu8(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    64 /* input channels per group */,
+    128 /* output_channels_per_group */,
+    64 /* input pixel stride */,
+    128 /* output pixel stride */,
+    127 /* input zero point */, 0.5f /* input scale */, 128 /* kernel zero point */, 0.5f /* kernel scale */,
+    w38.data(), w39.data(),
+    127 /* output zero point */, 0.5f /* output scale */, 0 /* output min */, 255 /* output max */,
+    0 /* flags */,
+    &op4);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #4" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op4, xnn_delete_operator);
+
+  xnn_operator_t op5 = nullptr;
+  status = xnn_create_convolution2d_nhwc_qu8(
+    1 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 1 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    128 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    128 /* input pixel stride */,
+    128 /* output pixel stride */,
+    127 /* input zero point */, 0.5f /* input scale */, 128 /* kernel zero point */, 0.5f /* kernel scale */,
+    w40.data(), w41.data(),
+    127 /* output zero point */, 0.5f /* output scale */, 0 /* output min */, 255 /* output max */,
+    0 /* flags */,
+    &op5);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #5" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op5, xnn_delete_operator);
+
+  xnn_operator_t op6 = nullptr;
+  status = xnn_create_convolution2d_nhwc_qu8(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    128 /* input channels per group */,
+    128 /* output_channels_per_group */,
+    128 /* input pixel stride */,
+    128 /* output pixel stride */,
+    127 /* input zero point */, 0.5f /* input scale */, 128 /* kernel zero point */, 0.5f /* kernel scale */,
+    w42.data(), w43.data(),
+    127 /* output zero point */, 0.5f /* output scale */, 0 /* output min */, 255 /* output max */,
+    0 /* flags */,
+    &op6);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #6" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op6, xnn_delete_operator);
+
+  xnn_operator_t op7 = nullptr;
+  status = xnn_create_convolution2d_nhwc_qu8(
+    0 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 0 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    2 /* subsampling height */, 2 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    128 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    128 /* input pixel stride */,
+    128 /* output pixel stride */,
+    127 /* input zero point */, 0.5f /* input scale */, 128 /* kernel zero point */, 0.5f /* kernel scale */,
+    w44.data(), w45.data(),
+    127 /* output zero point */, 0.5f /* output scale */, 0 /* output min */, 255 /* output max */,
+    0 /* flags */,
+    &op7);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #7" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op7, xnn_delete_operator);
+
+  xnn_operator_t op8 = nullptr;
+  status = xnn_create_convolution2d_nhwc_qu8(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    128 /* input channels per group */,
+    256 /* output_channels_per_group */,
+    128 /* input pixel stride */,
+    256 /* output pixel stride */,
+    127 /* input zero point */, 0.5f /* input scale */, 128 /* kernel zero point */, 0.5f /* kernel scale */,
+    w46.data(), w47.data(),
+    127 /* output zero point */, 0.5f /* output scale */, 0 /* output min */, 255 /* output max */,
+    0 /* flags */,
+    &op8);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #8" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op8, xnn_delete_operator);
+
+  xnn_operator_t op9 = nullptr;
+  status = xnn_create_convolution2d_nhwc_qu8(
+    1 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 1 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    256 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    256 /* input pixel stride */,
+    256 /* output pixel stride */,
+    127 /* input zero point */, 0.5f /* input scale */, 128 /* kernel zero point */, 0.5f /* kernel scale */,
+    w48.data(), w49.data(),
+    127 /* output zero point */, 0.5f /* output scale */, 0 /* output min */, 255 /* output max */,
+    0 /* flags */,
+    &op9);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #9" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op9, xnn_delete_operator);
+
+  xnn_operator_t op10 = nullptr;
+  status = xnn_create_convolution2d_nhwc_qu8(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    256 /* input channels per group */,
+    256 /* output_channels_per_group */,
+    256 /* input pixel stride */,
+    256 /* output pixel stride */,
+    127 /* input zero point */, 0.5f /* input scale */, 128 /* kernel zero point */, 0.5f /* kernel scale */,
+    w50.data(), w51.data(),
+    127 /* output zero point */, 0.5f /* output scale */, 0 /* output min */, 255 /* output max */,
+    0 /* flags */,
+    &op10);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #10" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op10, xnn_delete_operator);
+
+  xnn_operator_t op11 = nullptr;
+  status = xnn_create_convolution2d_nhwc_qu8(
+    0 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 0 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    2 /* subsampling height */, 2 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    256 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    256 /* input pixel stride */,
+    256 /* output pixel stride */,
+    127 /* input zero point */, 0.5f /* input scale */, 128 /* kernel zero point */, 0.5f /* kernel scale */,
+    w52.data(), w53.data(),
+    127 /* output zero point */, 0.5f /* output scale */, 0 /* output min */, 255 /* output max */,
+    0 /* flags */,
+    &op11);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #11" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op11, xnn_delete_operator);
+
+  xnn_operator_t op12 = nullptr;
+  status = xnn_create_convolution2d_nhwc_qu8(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    256 /* input channels per group */,
+    512 /* output_channels_per_group */,
+    256 /* input pixel stride */,
+    512 /* output pixel stride */,
+    127 /* input zero point */, 0.5f /* input scale */, 128 /* kernel zero point */, 0.5f /* kernel scale */,
+    w54.data(), w55.data(),
+    127 /* output zero point */, 0.5f /* output scale */, 0 /* output min */, 255 /* output max */,
+    0 /* flags */,
+    &op12);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #12" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op12, xnn_delete_operator);
+
+  xnn_operator_t op13 = nullptr;
+  status = xnn_create_convolution2d_nhwc_qu8(
+    1 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 1 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    512 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    512 /* input pixel stride */,
+    512 /* output pixel stride */,
+    127 /* input zero point */, 0.5f /* input scale */, 128 /* kernel zero point */, 0.5f /* kernel scale */,
+    w56.data(), w57.data(),
+    127 /* output zero point */, 0.5f /* output scale */, 0 /* output min */, 255 /* output max */,
+    0 /* flags */,
+    &op13);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #13" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op13, xnn_delete_operator);
+
+  xnn_operator_t op14 = nullptr;
+  status = xnn_create_convolution2d_nhwc_qu8(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    512 /* input channels per group */,
+    512 /* output_channels_per_group */,
+    512 /* input pixel stride */,
+    512 /* output pixel stride */,
+    127 /* input zero point */, 0.5f /* input scale */, 128 /* kernel zero point */, 0.5f /* kernel scale */,
+    w58.data(), w59.data(),
+    127 /* output zero point */, 0.5f /* output scale */, 0 /* output min */, 255 /* output max */,
+    0 /* flags */,
+    &op14);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #14" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op14, xnn_delete_operator);
+
+  xnn_operator_t op15 = nullptr;
+  status = xnn_create_convolution2d_nhwc_qu8(
+    1 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 1 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    512 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    512 /* input pixel stride */,
+    512 /* output pixel stride */,
+    127 /* input zero point */, 0.5f /* input scale */, 128 /* kernel zero point */, 0.5f /* kernel scale */,
+    w60.data(), w61.data(),
+    127 /* output zero point */, 0.5f /* output scale */, 0 /* output min */, 255 /* output max */,
+    0 /* flags */,
+    &op15);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #15" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op15, xnn_delete_operator);
+
+  xnn_operator_t op16 = nullptr;
+  status = xnn_create_convolution2d_nhwc_qu8(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    512 /* input channels per group */,
+    512 /* output_channels_per_group */,
+    512 /* input pixel stride */,
+    512 /* output pixel stride */,
+    127 /* input zero point */, 0.5f /* input scale */, 128 /* kernel zero point */, 0.5f /* kernel scale */,
+    w62.data(), w63.data(),
+    127 /* output zero point */, 0.5f /* output scale */, 0 /* output min */, 255 /* output max */,
+    0 /* flags */,
+    &op16);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #16" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op16, xnn_delete_operator);
+
+  xnn_operator_t op17 = nullptr;
+  status = xnn_create_convolution2d_nhwc_qu8(
+    1 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 1 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    512 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    512 /* input pixel stride */,
+    512 /* output pixel stride */,
+    127 /* input zero point */, 0.5f /* input scale */, 128 /* kernel zero point */, 0.5f /* kernel scale */,
+    w64.data(), w65.data(),
+    127 /* output zero point */, 0.5f /* output scale */, 0 /* output min */, 255 /* output max */,
+    0 /* flags */,
+    &op17);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #17" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op17, xnn_delete_operator);
+
+  xnn_operator_t op18 = nullptr;
+  status = xnn_create_convolution2d_nhwc_qu8(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    512 /* input channels per group */,
+    512 /* output_channels_per_group */,
+    512 /* input pixel stride */,
+    512 /* output pixel stride */,
+    127 /* input zero point */, 0.5f /* input scale */, 128 /* kernel zero point */, 0.5f /* kernel scale */,
+    w66.data(), w67.data(),
+    127 /* output zero point */, 0.5f /* output scale */, 0 /* output min */, 255 /* output max */,
+    0 /* flags */,
+    &op18);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #18" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op18, xnn_delete_operator);
+
+  xnn_operator_t op19 = nullptr;
+  status = xnn_create_convolution2d_nhwc_qu8(
+    1 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 1 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    512 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    512 /* input pixel stride */,
+    512 /* output pixel stride */,
+    127 /* input zero point */, 0.5f /* input scale */, 128 /* kernel zero point */, 0.5f /* kernel scale */,
+    w68.data(), w69.data(),
+    127 /* output zero point */, 0.5f /* output scale */, 0 /* output min */, 255 /* output max */,
+    0 /* flags */,
+    &op19);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #19" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op19, xnn_delete_operator);
+
+  xnn_operator_t op20 = nullptr;
+  status = xnn_create_convolution2d_nhwc_qu8(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    512 /* input channels per group */,
+    512 /* output_channels_per_group */,
+    512 /* input pixel stride */,
+    512 /* output pixel stride */,
+    127 /* input zero point */, 0.5f /* input scale */, 128 /* kernel zero point */, 0.5f /* kernel scale */,
+    w70.data(), w71.data(),
+    127 /* output zero point */, 0.5f /* output scale */, 0 /* output min */, 255 /* output max */,
+    0 /* flags */,
+    &op20);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #20" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op20, xnn_delete_operator);
+
+  xnn_operator_t op21 = nullptr;
+  status = xnn_create_convolution2d_nhwc_qu8(
+    1 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 1 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    512 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    512 /* input pixel stride */,
+    512 /* output pixel stride */,
+    127 /* input zero point */, 0.5f /* input scale */, 128 /* kernel zero point */, 0.5f /* kernel scale */,
+    w72.data(), w73.data(),
+    127 /* output zero point */, 0.5f /* output scale */, 0 /* output min */, 255 /* output max */,
+    0 /* flags */,
+    &op21);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #21" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op21, xnn_delete_operator);
+
+  xnn_operator_t op22 = nullptr;
+  status = xnn_create_convolution2d_nhwc_qu8(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    512 /* input channels per group */,
+    512 /* output_channels_per_group */,
+    512 /* input pixel stride */,
+    512 /* output pixel stride */,
+    127 /* input zero point */, 0.5f /* input scale */, 128 /* kernel zero point */, 0.5f /* kernel scale */,
+    w74.data(), w75.data(),
+    127 /* output zero point */, 0.5f /* output scale */, 0 /* output min */, 255 /* output max */,
+    0 /* flags */,
+    &op22);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #22" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op22, xnn_delete_operator);
+
+  xnn_operator_t op23 = nullptr;
+  status = xnn_create_convolution2d_nhwc_qu8(
+    0 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 0 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    2 /* subsampling height */, 2 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    512 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    512 /* input pixel stride */,
+    512 /* output pixel stride */,
+    127 /* input zero point */, 0.5f /* input scale */, 128 /* kernel zero point */, 0.5f /* kernel scale */,
+    w76.data(), w77.data(),
+    127 /* output zero point */, 0.5f /* output scale */, 0 /* output min */, 255 /* output max */,
+    0 /* flags */,
+    &op23);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #23" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op23, xnn_delete_operator);
+
+  xnn_operator_t op24 = nullptr;
+  status = xnn_create_convolution2d_nhwc_qu8(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    512 /* input channels per group */,
+    1024 /* output_channels_per_group */,
+    512 /* input pixel stride */,
+    1024 /* output pixel stride */,
+    127 /* input zero point */, 0.5f /* input scale */, 128 /* kernel zero point */, 0.5f /* kernel scale */,
+    w78.data(), w79.data(),
+    127 /* output zero point */, 0.5f /* output scale */, 0 /* output min */, 255 /* output max */,
+    0 /* flags */,
+    &op24);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #24" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op24, xnn_delete_operator);
+
+  xnn_operator_t op25 = nullptr;
+  status = xnn_create_convolution2d_nhwc_qu8(
+    1 /* top padding */, 1 /* right padding */,
+    1 /* bottom padding */, 1 /* left padding */,
+    3 /* kernel height */, 3 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1024 /* groups */,
+    1 /* input channels per group */,
+    1 /* output_channels_per_group */,
+    1024 /* input pixel stride */,
+    1024 /* output pixel stride */,
+    127 /* input zero point */, 0.5f /* input scale */, 128 /* kernel zero point */, 0.5f /* kernel scale */,
+    w80.data(), w81.data(),
+    127 /* output zero point */, 0.5f /* output scale */, 0 /* output min */, 255 /* output max */,
+    0 /* flags */,
+    &op25);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #25" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op25, xnn_delete_operator);
+
+  xnn_operator_t op26 = nullptr;
+  status = xnn_create_convolution2d_nhwc_qu8(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    1024 /* input channels per group */,
+    1024 /* output_channels_per_group */,
+    1024 /* input pixel stride */,
+    1024 /* output pixel stride */,
+    127 /* input zero point */, 0.5f /* input scale */, 128 /* kernel zero point */, 0.5f /* kernel scale */,
+    w82.data(), w83.data(),
+    127 /* output zero point */, 0.5f /* output scale */, 0 /* output min */, 255 /* output max */,
+    0 /* flags */,
+    &op26);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #26" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op26, xnn_delete_operator);
+
+  xnn_operator_t op27 = nullptr;
+  status = xnn_create_global_average_pooling_nwc_qu8(
+    1024 /* channels */, 1024 /* input stride */, 1024 /* output stride */,
+    127 /* input zero point */, 0.5f /* input scale */,
+    127 /* output zero point */, 0.5f /* output scale */,
+    0 /* output min */, 255 /* output max */,
+    0 /* flags */,
+    &op27);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #27" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op27, xnn_delete_operator);
+
+  xnn_operator_t op28 = nullptr;
+  status = xnn_create_convolution2d_nhwc_qu8(
+    0 /* top padding */, 0 /* right padding */,
+    0 /* bottom padding */, 0 /* left padding */,
+    1 /* kernel height */, 1 /* kernel width */,
+    1 /* subsampling height */, 1 /* subsampling width */,
+    1 /* dilation_height */, 1 /* dilation_width */,
+    1 /* groups */,
+    1024 /* input channels per group */,
+    1001 /* output_channels_per_group */,
+    1024 /* input pixel stride */,
+    1001 /* output pixel stride */,
+    127 /* input zero point */, 0.5f /* input scale */, 128 /* kernel zero point */, 0.5f /* kernel scale */,
+    w84.data(), w85.data(),
+    127 /* output zero point */, 0.5f /* output scale */, 0 /* output min */, 255 /* output max */,
+    0 /* flags */,
+    &op28);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to create operation #28" << std::endl;
+    return ExecutionPlan();
+  }
+  operators.emplace_back(op28, xnn_delete_operator);
+
+
+
+  status = xnn_setup_convolution2d_nhwc_qu8(
+    op0,
+    1 /* batch size */, 224 /* input height */, 224 /* input width */,
+    v0.data() /* input */, v1.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #0" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nhwc_qu8(
+    op1,
+    1 /* batch size */, 112 /* input height */, 112 /* input width */,
+    v1.data() /* input */, v2.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #1" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nhwc_qu8(
+    op2,
+    1 /* batch size */, 112 /* input height */, 112 /* input width */,
+    v2.data() /* input */, v3.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #2" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nhwc_qu8(
+    op3,
+    1 /* batch size */, 112 /* input height */, 112 /* input width */,
+    v3.data() /* input */, v4.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #3" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nhwc_qu8(
+    op4,
+    1 /* batch size */, 56 /* input height */, 56 /* input width */,
+    v4.data() /* input */, v5.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #4" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nhwc_qu8(
+    op5,
+    1 /* batch size */, 56 /* input height */, 56 /* input width */,
+    v5.data() /* input */, v6.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #5" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nhwc_qu8(
+    op6,
+    1 /* batch size */, 56 /* input height */, 56 /* input width */,
+    v6.data() /* input */, v7.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #6" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nhwc_qu8(
+    op7,
+    1 /* batch size */, 56 /* input height */, 56 /* input width */,
+    v7.data() /* input */, v8.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #7" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nhwc_qu8(
+    op8,
+    1 /* batch size */, 28 /* input height */, 28 /* input width */,
+    v8.data() /* input */, v9.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #8" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nhwc_qu8(
+    op9,
+    1 /* batch size */, 28 /* input height */, 28 /* input width */,
+    v9.data() /* input */, v10.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #9" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nhwc_qu8(
+    op10,
+    1 /* batch size */, 28 /* input height */, 28 /* input width */,
+    v10.data() /* input */, v11.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #10" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nhwc_qu8(
+    op11,
+    1 /* batch size */, 28 /* input height */, 28 /* input width */,
+    v11.data() /* input */, v12.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #11" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nhwc_qu8(
+    op12,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v12.data() /* input */, v13.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #12" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nhwc_qu8(
+    op13,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v13.data() /* input */, v14.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #13" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nhwc_qu8(
+    op14,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v14.data() /* input */, v15.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #14" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nhwc_qu8(
+    op15,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v15.data() /* input */, v16.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #15" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nhwc_qu8(
+    op16,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v16.data() /* input */, v17.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #16" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nhwc_qu8(
+    op17,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v17.data() /* input */, v18.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #17" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nhwc_qu8(
+    op18,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v18.data() /* input */, v19.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #18" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nhwc_qu8(
+    op19,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v19.data() /* input */, v20.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #19" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nhwc_qu8(
+    op20,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v20.data() /* input */, v21.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #20" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nhwc_qu8(
+    op21,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v21.data() /* input */, v22.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #21" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nhwc_qu8(
+    op22,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v22.data() /* input */, v23.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #22" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nhwc_qu8(
+    op23,
+    1 /* batch size */, 14 /* input height */, 14 /* input width */,
+    v23.data() /* input */, v24.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #23" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nhwc_qu8(
+    op24,
+    1 /* batch size */, 7 /* input height */, 7 /* input width */,
+    v24.data() /* input */, v25.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #24" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nhwc_qu8(
+    op25,
+    1 /* batch size */, 7 /* input height */, 7 /* input width */,
+    v25.data() /* input */, v26.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #25" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nhwc_qu8(
+    op26,
+    1 /* batch size */, 7 /* input height */, 7 /* input width */,
+    v26.data() /* input */, v27.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #26" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_global_average_pooling_nwc_qu8(
+    op27,
+    1 /* batch size */, 49 /* width */,
+    v27.data() /* input */, v28.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #27" << std::endl;
+    return ExecutionPlan();
+  }
+
+  status = xnn_setup_convolution2d_nhwc_qu8(
+    op28,
+    1 /* batch size */, 1 /* input height */, 1 /* input width */,
+    v28.data() /* input */, v29.data() /* output */,
+    threadpool /* threadpool */);
+  if (status != xnn_status_success) {
+    std::cerr << "failed to setup operation #28" << std::endl;
+    return ExecutionPlan();
+  }
+
+  #pragma clang diagnostic push
+  #pragma clang diagnostic ignored "-Wpessimizing-move"
+  return operators;
+  #pragma clang diagnostic pop
+}
+
+}  // namespace models
diff --git a/scripts/generate-f32-ibilinear-chw.sh b/scripts/generate-f32-ibilinear-chw.sh
index 845a71c..1b14360 100755
--- a/scripts/generate-f32-ibilinear-chw.sh
+++ b/scripts/generate-f32-ibilinear-chw.sh
@@ -13,5 +13,12 @@
 tools/xngen src/f32-ibilinear-chw/wasmsimd.c.in -D PIXEL_TILE=4 -o src/f32-ibilinear-chw/gen/wasmsimd-p4.c
 tools/xngen src/f32-ibilinear-chw/wasmsimd.c.in -D PIXEL_TILE=8 -o src/f32-ibilinear-chw/gen/wasmsimd-p8.c
 
+############################### ARM NEON ##############################
+tools/xngen src/f32-ibilinear-chw/neon.c.in -D PIXEL_TILE=4 -D FMA=0 -o src/f32-ibilinear-chw/gen/neon-p4.c
+tools/xngen src/f32-ibilinear-chw/neon.c.in -D PIXEL_TILE=8 -D FMA=0 -o src/f32-ibilinear-chw/gen/neon-p8.c
+
+tools/xngen src/f32-ibilinear-chw/neon.c.in -D PIXEL_TILE=4 -D FMA=1 -o src/f32-ibilinear-chw/gen/neonfma-p4.c
+tools/xngen src/f32-ibilinear-chw/neon.c.in -D PIXEL_TILE=8 -D FMA=1 -o src/f32-ibilinear-chw/gen/neonfma-p8.c
+
 ################################## Unit tests #################################
 tools/generate-ibilinear-chw-test.py --spec test/f32-ibilinear-chw.yaml --output test/f32-ibilinear-chw.cc
diff --git a/scripts/generate-qs8-gemm.sh b/scripts/generate-qs8-gemm.sh
index 94396fe..5f071ac 100755
--- a/scripts/generate-qs8-gemm.sh
+++ b/scripts/generate-qs8-gemm.sh
@@ -4,11 +4,6 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-#################################### Scalar ###################################
-### C4 micro-kernels
-tools/xngen src/qs8-gemm/MRxNRc4-scalar.c.in -D MR=8  -D NR=8 -o src/qs8-gemm/gen/8x8c4-minmax-scalar.c
-tools/xngen src/qs8-gemm/MRxNRc4-scalar.c.in -D MR=12 -D NR=4 -o src/qs8-gemm/gen/12x4c4-minmax-scalar.c
-
 ################################## WAsm SIMD ##################################
 ### C8 micro-kernels
 tools/xngen src/qs8-gemm/MRx4c8-wasmsimd.c.in -D MR=1 -D VARIANT=LD64     -o src/qs8-gemm/gen/1x4c8-minmax-wasmsimd-ld64.c
@@ -26,16 +21,77 @@
 ################################### ARM NEON ##################################
 tools/xngen src/qs8-gemm/neon-mlal-lane.c.in -D MR=1 -D NR=8 -o src/qs8-gemm/gen/1x8-minmax-neon-mlal-lane.c
 tools/xngen src/qs8-gemm/neon-mlal-lane.c.in -D MR=2 -D NR=8 -o src/qs8-gemm/gen/2x8-minmax-neon-mlal-lane.c
+tools/xngen src/qs8-gemm/neon-mlal-lane.c.in -D MR=3 -D NR=8 -o src/qs8-gemm/gen/3x8-minmax-neon-mlal-lane.c
+tools/xngen src/qs8-gemm/neon-mlal-lane.c.in -D MR=4 -D NR=8 -o src/qs8-gemm/gen/4x8-minmax-neon-mlal-lane.c
 
 tools/xngen src/qs8-gemm/neon-mlal-lane.c.in -D MR=1 -D NR=16 -o src/qs8-gemm/gen/1x16-minmax-neon-mlal-lane.c
 tools/xngen src/qs8-gemm/neon-mlal-lane.c.in -D MR=2 -D NR=16 -o src/qs8-gemm/gen/2x16-minmax-neon-mlal-lane.c
+tools/xngen src/qs8-gemm/neon-mlal-lane.c.in -D MR=3 -D NR=16 -o src/qs8-gemm/gen/3x16-minmax-neon-mlal-lane.c
+tools/xngen src/qs8-gemm/neon-mlal-lane.c.in -D MR=4 -D NR=16 -o src/qs8-gemm/gen/4x16-minmax-neon-mlal-lane.c
+
+tools/xngen src/qs8-gemm/neon-mull-addw-dup.c.in -D MR=1 -D NR=8 -o src/qs8-gemm/gen/1x8-minmax-neon-mull-addw-dup.c
+tools/xngen src/qs8-gemm/neon-mull-addw-dup.c.in -D MR=2 -D NR=8 -o src/qs8-gemm/gen/2x8-minmax-neon-mull-addw-dup.c
+tools/xngen src/qs8-gemm/neon-mull-addw-dup.c.in -D MR=3 -D NR=8 -o src/qs8-gemm/gen/3x8-minmax-neon-mull-addw-dup.c
+tools/xngen src/qs8-gemm/neon-mull-addw-dup.c.in -D MR=4 -D NR=8 -o src/qs8-gemm/gen/4x8-minmax-neon-mull-addw-dup.c
+
+tools/xngen src/qs8-gemm/neon-mull-addw-dup.c.in -D MR=1 -D NR=16 -o src/qs8-gemm/gen/1x16-minmax-neon-mull-addw-dup.c
+tools/xngen src/qs8-gemm/neon-mull-addw-dup.c.in -D MR=2 -D NR=16 -o src/qs8-gemm/gen/2x16-minmax-neon-mull-addw-dup.c
+tools/xngen src/qs8-gemm/neon-mull-addw-dup.c.in -D MR=3 -D NR=16 -o src/qs8-gemm/gen/3x16-minmax-neon-mull-addw-dup.c
+tools/xngen src/qs8-gemm/neon-mull-addw-dup.c.in -D MR=4 -D NR=16 -o src/qs8-gemm/gen/4x16-minmax-neon-mull-addw-dup.c
+
+### C2 micro-kernels
+tools/xngen src/qs8-gemm/c2-neon-mull-padal-dup.c.in -D MR=1 -D NR=8  -D MLA=0 -o src/qs8-gemm/gen/1x8c2-minmax-neon-mull-padal-dup.c
+tools/xngen src/qs8-gemm/c2-neon-mull-padal-dup.c.in -D MR=2 -D NR=8  -D MLA=0 -o src/qs8-gemm/gen/2x8c2-minmax-neon-mull-padal-dup.c
+tools/xngen src/qs8-gemm/c2-neon-mull-padal-dup.c.in -D MR=3 -D NR=8  -D MLA=0 -o src/qs8-gemm/gen/3x8c2-minmax-neon-mull-padal-dup.c
+tools/xngen src/qs8-gemm/c2-neon-mull-padal-dup.c.in -D MR=4 -D NR=8  -D MLA=0 -o src/qs8-gemm/gen/4x8c2-minmax-neon-mull-padal-dup.c
+tools/xngen src/qs8-gemm/c2-neon-mull-padal-dup.c.in -D MR=1 -D NR=16 -D MLA=0 -o src/qs8-gemm/gen/1x16c2-minmax-neon-mull-padal-dup.c
+tools/xngen src/qs8-gemm/c2-neon-mull-padal-dup.c.in -D MR=2 -D NR=16 -D MLA=0 -o src/qs8-gemm/gen/2x16c2-minmax-neon-mull-padal-dup.c
+tools/xngen src/qs8-gemm/c2-neon-mull-padal-dup.c.in -D MR=3 -D NR=16 -D MLA=0 -o src/qs8-gemm/gen/3x16c2-minmax-neon-mull-padal-dup.c
+tools/xngen src/qs8-gemm/c2-neon-mull-padal-dup.c.in -D MR=4 -D NR=16 -D MLA=0 -o src/qs8-gemm/gen/4x16c2-minmax-neon-mull-padal-dup.c
+
+tools/xngen src/qs8-gemm/c2-neon-mull-padal-dup.c.in -D MR=1 -D NR=8  -D MLA=1 -o src/qs8-gemm/gen/1x8c2-minmax-neon-mlal-padal-dup.c
+tools/xngen src/qs8-gemm/c2-neon-mull-padal-dup.c.in -D MR=2 -D NR=8  -D MLA=1 -o src/qs8-gemm/gen/2x8c2-minmax-neon-mlal-padal-dup.c
+tools/xngen src/qs8-gemm/c2-neon-mull-padal-dup.c.in -D MR=3 -D NR=8  -D MLA=1 -o src/qs8-gemm/gen/3x8c2-minmax-neon-mlal-padal-dup.c
+tools/xngen src/qs8-gemm/c2-neon-mull-padal-dup.c.in -D MR=4 -D NR=8  -D MLA=1 -o src/qs8-gemm/gen/4x8c2-minmax-neon-mlal-padal-dup.c
+tools/xngen src/qs8-gemm/c2-neon-mull-padal-dup.c.in -D MR=1 -D NR=16 -D MLA=1 -o src/qs8-gemm/gen/1x16c2-minmax-neon-mlal-padal-dup.c
+tools/xngen src/qs8-gemm/c2-neon-mull-padal-dup.c.in -D MR=2 -D NR=16 -D MLA=1 -o src/qs8-gemm/gen/2x16c2-minmax-neon-mlal-padal-dup.c
+tools/xngen src/qs8-gemm/c2-neon-mull-padal-dup.c.in -D MR=3 -D NR=16 -D MLA=1 -o src/qs8-gemm/gen/3x16c2-minmax-neon-mlal-padal-dup.c
+tools/xngen src/qs8-gemm/c2-neon-mull-padal-dup.c.in -D MR=4 -D NR=16 -D MLA=1 -o src/qs8-gemm/gen/4x16c2-minmax-neon-mlal-padal-dup.c
+
+### C8 micro-kernels
+tools/xngen src/qs8-gemm/c8-neon-mull-padal.c.in -D MR=1 -D NR=8  -D MLA=0 -o src/qs8-gemm/gen/1x8c8-minmax-neon-mull-padal.c
+tools/xngen src/qs8-gemm/c8-neon-mull-padal.c.in -D MR=2 -D NR=8  -D MLA=0 -o src/qs8-gemm/gen/2x8c8-minmax-neon-mull-padal.c
+tools/xngen src/qs8-gemm/c8-neon-mull-padal.c.in -D MR=3 -D NR=8  -D MLA=0 -o src/qs8-gemm/gen/3x8c8-minmax-neon-mull-padal.c
+tools/xngen src/qs8-gemm/c8-neon-mull-padal.c.in -D MR=4 -D NR=8  -D MLA=0 -o src/qs8-gemm/gen/4x8c8-minmax-neon-mull-padal.c
+tools/xngen src/qs8-gemm/c8-neon-mull-padal.c.in -D MR=1 -D NR=16 -D MLA=0 -o src/qs8-gemm/gen/1x16c8-minmax-neon-mull-padal.c
+tools/xngen src/qs8-gemm/c8-neon-mull-padal.c.in -D MR=2 -D NR=16 -D MLA=0 -o src/qs8-gemm/gen/2x16c8-minmax-neon-mull-padal.c
+tools/xngen src/qs8-gemm/c8-neon-mull-padal.c.in -D MR=3 -D NR=16 -D MLA=0 -o src/qs8-gemm/gen/3x16c8-minmax-neon-mull-padal.c
+tools/xngen src/qs8-gemm/c8-neon-mull-padal.c.in -D MR=4 -D NR=16 -D MLA=0 -o src/qs8-gemm/gen/4x16c8-minmax-neon-mull-padal.c
+
+tools/xngen src/qs8-gemm/c8-neon-mull-padal.c.in -D MR=1 -D NR=8  -D MLA=1 -o src/qs8-gemm/gen/1x8c8-minmax-neon-mlal-padal.c
+tools/xngen src/qs8-gemm/c8-neon-mull-padal.c.in -D MR=2 -D NR=8  -D MLA=1 -o src/qs8-gemm/gen/2x8c8-minmax-neon-mlal-padal.c
+tools/xngen src/qs8-gemm/c8-neon-mull-padal.c.in -D MR=3 -D NR=8  -D MLA=1 -o src/qs8-gemm/gen/3x8c8-minmax-neon-mlal-padal.c
+tools/xngen src/qs8-gemm/c8-neon-mull-padal.c.in -D MR=4 -D NR=8  -D MLA=1 -o src/qs8-gemm/gen/4x8c8-minmax-neon-mlal-padal.c
+tools/xngen src/qs8-gemm/c8-neon-mull-padal.c.in -D MR=1 -D NR=16 -D MLA=1 -o src/qs8-gemm/gen/1x16c8-minmax-neon-mlal-padal.c
+tools/xngen src/qs8-gemm/c8-neon-mull-padal.c.in -D MR=2 -D NR=16 -D MLA=1 -o src/qs8-gemm/gen/2x16c8-minmax-neon-mlal-padal.c
+tools/xngen src/qs8-gemm/c8-neon-mull-padal.c.in -D MR=3 -D NR=16 -D MLA=1 -o src/qs8-gemm/gen/3x16c8-minmax-neon-mlal-padal.c
+tools/xngen src/qs8-gemm/c8-neon-mull-padal.c.in -D MR=4 -D NR=16 -D MLA=1 -o src/qs8-gemm/gen/4x16c8-minmax-neon-mlal-padal.c
+
+### C16 micro-kernels
+tools/xngen src/qs8-gemm/c16-neon-mlal-padal.c.in -D MR=1 -D NR=8  -o src/qs8-gemm/gen/1x8c16-minmax-neon-mlal-padal.c
+tools/xngen src/qs8-gemm/c16-neon-mlal-padal.c.in -D MR=2 -D NR=8  -o src/qs8-gemm/gen/2x8c16-minmax-neon-mlal-padal.c
+tools/xngen src/qs8-gemm/c16-neon-mlal-padal.c.in -D MR=3 -D NR=8  -o src/qs8-gemm/gen/3x8c16-minmax-neon-mlal-padal.c
+tools/xngen src/qs8-gemm/c16-neon-mlal-padal.c.in -D MR=4 -D NR=8  -o src/qs8-gemm/gen/4x8c16-minmax-neon-mlal-padal.c
+tools/xngen src/qs8-gemm/c16-neon-mlal-padal.c.in -D MR=1 -D NR=16 -o src/qs8-gemm/gen/1x16c16-minmax-neon-mlal-padal.c
+tools/xngen src/qs8-gemm/c16-neon-mlal-padal.c.in -D MR=2 -D NR=16 -o src/qs8-gemm/gen/2x16c16-minmax-neon-mlal-padal.c
+tools/xngen src/qs8-gemm/c16-neon-mlal-padal.c.in -D MR=3 -D NR=16 -o src/qs8-gemm/gen/3x16c16-minmax-neon-mlal-padal.c
+tools/xngen src/qs8-gemm/c16-neon-mlal-padal.c.in -D MR=4 -D NR=16 -o src/qs8-gemm/gen/4x16c16-minmax-neon-mlal-padal.c
 
 ### C4 micro-kernels
 tools/xngen src/qs8-gemm/MRxNRc4-neondot.c.in -D MR=1  -D NR=8  -o src/qs8-gemm/gen/1x8c4-minmax-neondot.c
 tools/xngen src/qs8-gemm/MRxNRc4-neondot.c.in -D MR=4  -D NR=8  -o src/qs8-gemm/gen/4x8c4-minmax-neondot.c
 tools/xngen src/qs8-gemm/MRxNRc4-neondot.c.in -D MR=6  -D NR=8  -o src/qs8-gemm/gen/6x8c4-minmax-neondot.c
 tools/xngen src/qs8-gemm/MRxNRc4-neondot.c.in -D MR=8  -D NR=8  -o src/qs8-gemm/gen/8x8c4-minmax-neondot.c
-tools/xngen src/qs8-gemm/MRxNRc4-neondot.c.in -D MR=12 -D NR=8  -o src/qs8-gemm/gen/12x8c4-minmax-neondot.c
 tools/xngen src/qs8-gemm/MRxNRc4-neondot.c.in -D MR=1  -D NR=16 -o src/qs8-gemm/gen/1x16c4-minmax-neondot.c
 tools/xngen src/qs8-gemm/MRxNRc4-neondot.c.in -D MR=4  -D NR=16 -o src/qs8-gemm/gen/4x16c4-minmax-neondot.c
 tools/xngen src/qs8-gemm/MRxNRc4-neondot.c.in -D MR=6  -D NR=16 -o src/qs8-gemm/gen/6x16c4-minmax-neondot.c
diff --git a/scripts/generate-qs8-igemm.sh b/scripts/generate-qs8-igemm.sh
index 215976a..e2a1960 100755
--- a/scripts/generate-qs8-igemm.sh
+++ b/scripts/generate-qs8-igemm.sh
@@ -17,16 +17,77 @@
 ################################### ARM NEON ##################################
 tools/xngen src/qs8-igemm/neon-mlal-lane.c.in -D MR=1 -D NR=8 -o src/qs8-igemm/gen/1x8-minmax-neon-mlal-lane.c
 tools/xngen src/qs8-igemm/neon-mlal-lane.c.in -D MR=2 -D NR=8 -o src/qs8-igemm/gen/2x8-minmax-neon-mlal-lane.c
+tools/xngen src/qs8-igemm/neon-mlal-lane.c.in -D MR=3 -D NR=8 -o src/qs8-igemm/gen/3x8-minmax-neon-mlal-lane.c
+tools/xngen src/qs8-igemm/neon-mlal-lane.c.in -D MR=4 -D NR=8 -o src/qs8-igemm/gen/4x8-minmax-neon-mlal-lane.c
 
 tools/xngen src/qs8-igemm/neon-mlal-lane.c.in -D MR=1 -D NR=16 -o src/qs8-igemm/gen/1x16-minmax-neon-mlal-lane.c
 tools/xngen src/qs8-igemm/neon-mlal-lane.c.in -D MR=2 -D NR=16 -o src/qs8-igemm/gen/2x16-minmax-neon-mlal-lane.c
+tools/xngen src/qs8-igemm/neon-mlal-lane.c.in -D MR=3 -D NR=16 -o src/qs8-igemm/gen/3x16-minmax-neon-mlal-lane.c
+tools/xngen src/qs8-igemm/neon-mlal-lane.c.in -D MR=4 -D NR=16 -o src/qs8-igemm/gen/4x16-minmax-neon-mlal-lane.c
+
+tools/xngen src/qs8-igemm/neon-mull-addw-dup.c.in -D MR=1 -D NR=8 -o src/qs8-igemm/gen/1x8-minmax-neon-mull-addw-dup.c
+tools/xngen src/qs8-igemm/neon-mull-addw-dup.c.in -D MR=2 -D NR=8 -o src/qs8-igemm/gen/2x8-minmax-neon-mull-addw-dup.c
+tools/xngen src/qs8-igemm/neon-mull-addw-dup.c.in -D MR=3 -D NR=8 -o src/qs8-igemm/gen/3x8-minmax-neon-mull-addw-dup.c
+tools/xngen src/qs8-igemm/neon-mull-addw-dup.c.in -D MR=4 -D NR=8 -o src/qs8-igemm/gen/4x8-minmax-neon-mull-addw-dup.c
+
+tools/xngen src/qs8-igemm/neon-mull-addw-dup.c.in -D MR=1 -D NR=16 -o src/qs8-igemm/gen/1x16-minmax-neon-mull-addw-dup.c
+tools/xngen src/qs8-igemm/neon-mull-addw-dup.c.in -D MR=2 -D NR=16 -o src/qs8-igemm/gen/2x16-minmax-neon-mull-addw-dup.c
+tools/xngen src/qs8-igemm/neon-mull-addw-dup.c.in -D MR=3 -D NR=16 -o src/qs8-igemm/gen/3x16-minmax-neon-mull-addw-dup.c
+tools/xngen src/qs8-igemm/neon-mull-addw-dup.c.in -D MR=4 -D NR=16 -o src/qs8-igemm/gen/4x16-minmax-neon-mull-addw-dup.c
+
+### C2 micro-kernels
+tools/xngen src/qs8-igemm/c2-neon-mull-padal-dup.c.in -D MR=1 -D NR=8  -D MLA=0 -o src/qs8-igemm/gen/1x8c2-minmax-neon-mull-padal-dup.c
+tools/xngen src/qs8-igemm/c2-neon-mull-padal-dup.c.in -D MR=2 -D NR=8  -D MLA=0 -o src/qs8-igemm/gen/2x8c2-minmax-neon-mull-padal-dup.c
+tools/xngen src/qs8-igemm/c2-neon-mull-padal-dup.c.in -D MR=3 -D NR=8  -D MLA=0 -o src/qs8-igemm/gen/3x8c2-minmax-neon-mull-padal-dup.c
+tools/xngen src/qs8-igemm/c2-neon-mull-padal-dup.c.in -D MR=4 -D NR=8  -D MLA=0 -o src/qs8-igemm/gen/4x8c2-minmax-neon-mull-padal-dup.c
+tools/xngen src/qs8-igemm/c2-neon-mull-padal-dup.c.in -D MR=1 -D NR=16 -D MLA=0 -o src/qs8-igemm/gen/1x16c2-minmax-neon-mull-padal-dup.c
+tools/xngen src/qs8-igemm/c2-neon-mull-padal-dup.c.in -D MR=2 -D NR=16 -D MLA=0 -o src/qs8-igemm/gen/2x16c2-minmax-neon-mull-padal-dup.c
+tools/xngen src/qs8-igemm/c2-neon-mull-padal-dup.c.in -D MR=3 -D NR=16 -D MLA=0 -o src/qs8-igemm/gen/3x16c2-minmax-neon-mull-padal-dup.c
+tools/xngen src/qs8-igemm/c2-neon-mull-padal-dup.c.in -D MR=4 -D NR=16 -D MLA=0 -o src/qs8-igemm/gen/4x16c2-minmax-neon-mull-padal-dup.c
+
+tools/xngen src/qs8-igemm/c2-neon-mull-padal-dup.c.in -D MR=1 -D NR=8  -D MLA=1 -o src/qs8-igemm/gen/1x8c2-minmax-neon-mlal-padal-dup.c
+tools/xngen src/qs8-igemm/c2-neon-mull-padal-dup.c.in -D MR=2 -D NR=8  -D MLA=1 -o src/qs8-igemm/gen/2x8c2-minmax-neon-mlal-padal-dup.c
+tools/xngen src/qs8-igemm/c2-neon-mull-padal-dup.c.in -D MR=3 -D NR=8  -D MLA=1 -o src/qs8-igemm/gen/3x8c2-minmax-neon-mlal-padal-dup.c
+tools/xngen src/qs8-igemm/c2-neon-mull-padal-dup.c.in -D MR=4 -D NR=8  -D MLA=1 -o src/qs8-igemm/gen/4x8c2-minmax-neon-mlal-padal-dup.c
+tools/xngen src/qs8-igemm/c2-neon-mull-padal-dup.c.in -D MR=1 -D NR=16 -D MLA=1 -o src/qs8-igemm/gen/1x16c2-minmax-neon-mlal-padal-dup.c
+tools/xngen src/qs8-igemm/c2-neon-mull-padal-dup.c.in -D MR=2 -D NR=16 -D MLA=1 -o src/qs8-igemm/gen/2x16c2-minmax-neon-mlal-padal-dup.c
+tools/xngen src/qs8-igemm/c2-neon-mull-padal-dup.c.in -D MR=3 -D NR=16 -D MLA=1 -o src/qs8-igemm/gen/3x16c2-minmax-neon-mlal-padal-dup.c
+tools/xngen src/qs8-igemm/c2-neon-mull-padal-dup.c.in -D MR=4 -D NR=16 -D MLA=1 -o src/qs8-igemm/gen/4x16c2-minmax-neon-mlal-padal-dup.c
+
+### C8 micro-kernels
+tools/xngen src/qs8-igemm/c8-neon-mull-padal.c.in -D MR=1 -D NR=8  -D MLA=0 -o src/qs8-igemm/gen/1x8c8-minmax-neon-mull-padal.c
+tools/xngen src/qs8-igemm/c8-neon-mull-padal.c.in -D MR=2 -D NR=8  -D MLA=0 -o src/qs8-igemm/gen/2x8c8-minmax-neon-mull-padal.c
+tools/xngen src/qs8-igemm/c8-neon-mull-padal.c.in -D MR=3 -D NR=8  -D MLA=0 -o src/qs8-igemm/gen/3x8c8-minmax-neon-mull-padal.c
+tools/xngen src/qs8-igemm/c8-neon-mull-padal.c.in -D MR=4 -D NR=8  -D MLA=0 -o src/qs8-igemm/gen/4x8c8-minmax-neon-mull-padal.c
+tools/xngen src/qs8-igemm/c8-neon-mull-padal.c.in -D MR=1 -D NR=16 -D MLA=0 -o src/qs8-igemm/gen/1x16c8-minmax-neon-mull-padal.c
+tools/xngen src/qs8-igemm/c8-neon-mull-padal.c.in -D MR=2 -D NR=16 -D MLA=0 -o src/qs8-igemm/gen/2x16c8-minmax-neon-mull-padal.c
+tools/xngen src/qs8-igemm/c8-neon-mull-padal.c.in -D MR=3 -D NR=16 -D MLA=0 -o src/qs8-igemm/gen/3x16c8-minmax-neon-mull-padal.c
+tools/xngen src/qs8-igemm/c8-neon-mull-padal.c.in -D MR=4 -D NR=16 -D MLA=0 -o src/qs8-igemm/gen/4x16c8-minmax-neon-mull-padal.c
+
+tools/xngen src/qs8-igemm/c8-neon-mull-padal.c.in -D MR=1 -D NR=8  -D MLA=1 -o src/qs8-igemm/gen/1x8c8-minmax-neon-mlal-padal.c
+tools/xngen src/qs8-igemm/c8-neon-mull-padal.c.in -D MR=2 -D NR=8  -D MLA=1 -o src/qs8-igemm/gen/2x8c8-minmax-neon-mlal-padal.c
+tools/xngen src/qs8-igemm/c8-neon-mull-padal.c.in -D MR=3 -D NR=8  -D MLA=1 -o src/qs8-igemm/gen/3x8c8-minmax-neon-mlal-padal.c
+tools/xngen src/qs8-igemm/c8-neon-mull-padal.c.in -D MR=4 -D NR=8  -D MLA=1 -o src/qs8-igemm/gen/4x8c8-minmax-neon-mlal-padal.c
+tools/xngen src/qs8-igemm/c8-neon-mull-padal.c.in -D MR=1 -D NR=16 -D MLA=1 -o src/qs8-igemm/gen/1x16c8-minmax-neon-mlal-padal.c
+tools/xngen src/qs8-igemm/c8-neon-mull-padal.c.in -D MR=2 -D NR=16 -D MLA=1 -o src/qs8-igemm/gen/2x16c8-minmax-neon-mlal-padal.c
+tools/xngen src/qs8-igemm/c8-neon-mull-padal.c.in -D MR=3 -D NR=16 -D MLA=1 -o src/qs8-igemm/gen/3x16c8-minmax-neon-mlal-padal.c
+tools/xngen src/qs8-igemm/c8-neon-mull-padal.c.in -D MR=4 -D NR=16 -D MLA=1 -o src/qs8-igemm/gen/4x16c8-minmax-neon-mlal-padal.c
+
+### C16 micro-kernels
+tools/xngen src/qs8-igemm/c16-neon-mlal-padal.c.in -D MR=1 -D NR=8  -o src/qs8-igemm/gen/1x8c16-minmax-neon-mlal-padal.c
+tools/xngen src/qs8-igemm/c16-neon-mlal-padal.c.in -D MR=2 -D NR=8  -o src/qs8-igemm/gen/2x8c16-minmax-neon-mlal-padal.c
+tools/xngen src/qs8-igemm/c16-neon-mlal-padal.c.in -D MR=3 -D NR=8  -o src/qs8-igemm/gen/3x8c16-minmax-neon-mlal-padal.c
+tools/xngen src/qs8-igemm/c16-neon-mlal-padal.c.in -D MR=4 -D NR=8  -o src/qs8-igemm/gen/4x8c16-minmax-neon-mlal-padal.c
+tools/xngen src/qs8-igemm/c16-neon-mlal-padal.c.in -D MR=1 -D NR=16 -o src/qs8-igemm/gen/1x16c16-minmax-neon-mlal-padal.c
+tools/xngen src/qs8-igemm/c16-neon-mlal-padal.c.in -D MR=2 -D NR=16 -o src/qs8-igemm/gen/2x16c16-minmax-neon-mlal-padal.c
+tools/xngen src/qs8-igemm/c16-neon-mlal-padal.c.in -D MR=3 -D NR=16 -o src/qs8-igemm/gen/3x16c16-minmax-neon-mlal-padal.c
+tools/xngen src/qs8-igemm/c16-neon-mlal-padal.c.in -D MR=4 -D NR=16 -o src/qs8-igemm/gen/4x16c16-minmax-neon-mlal-padal.c
 
 ### C4 micro-kernels
 tools/xngen src/qs8-igemm/MRxNRc4-neondot.c.in -D MR=1  -D NR=8  -o src/qs8-igemm/gen/1x8c4-minmax-neondot.c
 tools/xngen src/qs8-igemm/MRxNRc4-neondot.c.in -D MR=4  -D NR=8  -o src/qs8-igemm/gen/4x8c4-minmax-neondot.c
 tools/xngen src/qs8-igemm/MRxNRc4-neondot.c.in -D MR=6  -D NR=8  -o src/qs8-igemm/gen/6x8c4-minmax-neondot.c
 tools/xngen src/qs8-igemm/MRxNRc4-neondot.c.in -D MR=8  -D NR=8  -o src/qs8-igemm/gen/8x8c4-minmax-neondot.c
-tools/xngen src/qs8-igemm/MRxNRc4-neondot.c.in -D MR=12 -D NR=8  -o src/qs8-igemm/gen/12x8c4-minmax-neondot.c
 tools/xngen src/qs8-igemm/MRxNRc4-neondot.c.in -D MR=1  -D NR=16 -o src/qs8-igemm/gen/1x16c4-minmax-neondot.c
 tools/xngen src/qs8-igemm/MRxNRc4-neondot.c.in -D MR=4  -D NR=16 -o src/qs8-igemm/gen/4x16c4-minmax-neondot.c
 tools/xngen src/qs8-igemm/MRxNRc4-neondot.c.in -D MR=6  -D NR=16 -o src/qs8-igemm/gen/6x16c4-minmax-neondot.c
diff --git a/scripts/generate-qu8-gemm.sh b/scripts/generate-qu8-gemm.sh
deleted file mode 100755
index babcb79..0000000
--- a/scripts/generate-qu8-gemm.sh
+++ /dev/null
@@ -1,10 +0,0 @@
-#!/bin/sh
-# Copyright 2020 Google LLC
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
-#################################### Scalar ###################################
-### C4 micro-kernels
-tools/xngen src/qu8-gemm/MRxNRc4-minmax-scalar.c.in -D MR=8  -D NR=8 -o src/qu8-gemm/gen/8x8c4-minmax-scalar.c
-tools/xngen src/qu8-gemm/MRxNRc4-minmax-scalar.c.in -D MR=12 -D NR=4 -o src/qu8-gemm/gen/12x4c4-minmax-scalar.c
diff --git a/src/f32-dwconv2d-chw/5x5p2-neon.c.in b/src/f32-dwconv2d-chw/5x5p2-neon.c.in
index e38dc13..7401d5e 100644
--- a/src/f32-dwconv2d-chw/5x5p2-neon.c.in
+++ b/src/f32-dwconv2d-chw/5x5p2-neon.c.in
@@ -325,7 +325,7 @@
       $for M in range(ROW_TILE):
         float32x4_t vo${M} = vmaxq_f32(vo${M}p0, vmin);
 
-      $for M in range(ROW_TILE):        
+      $for M in range(ROW_TILE):
         vo${M} = vminq_f32(vo${M}, vmax);
 
       $for M in reversed(range(ROW_TILE)):
diff --git a/src/f32-ibilinear-chw/gen/neon-p4.c b/src/f32-ibilinear-chw/gen/neon-p4.c
new file mode 100644
index 0000000..fc25f32
--- /dev/null
+++ b/src/f32-ibilinear-chw/gen/neon-p4.c
@@ -0,0 +1,167 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-ibilinear-chw/neon.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/ibilinear.h>
+
+
+void xnn_f32_ibilinear_chw_ukernel__neon_p4(
+    size_t output_pixels,
+    size_t channels,
+    const float**restrict input,
+    size_t input_offset,
+    const float*restrict weights,
+    float*restrict output,
+    size_t input_increment) XNN_DISABLE_TSAN
+{
+  assert(output_pixels != 0);
+  assert(channels != 0);
+  assert(input_increment % sizeof(float) == 0);
+
+  do {
+    const float** i = input;
+    const float* w = weights;
+    size_t p = output_pixels;
+
+    for (; p >= 4; p -= 4) {
+      const float* itl0 = (const float*) ((uintptr_t) i[0] + input_offset);
+      const float* ibl0 = (const float*) ((uintptr_t) i[1] + input_offset);
+      const float* itl1 = (const float*) ((uintptr_t) i[2] + input_offset);
+      const float* ibl1 = (const float*) ((uintptr_t) i[3] + input_offset);
+      const float* itl2 = (const float*) ((uintptr_t) i[4] + input_offset);
+      const float* ibl2 = (const float*) ((uintptr_t) i[5] + input_offset);
+      const float* itl3 = (const float*) ((uintptr_t) i[6] + input_offset);
+      const float* ibl3 = (const float*) ((uintptr_t) i[7] + input_offset);
+      i += 8;
+
+      const float32x4x2_t vw = vld2q_f32(w);
+      w += 8;
+
+      const float32x2_t vtltr0 = vld1_f32(itl0);
+      const float32x2_t vblbr0 = vld1_f32(ibl0);
+      const float32x2_t vtltr1 = vld1_f32(itl1);
+      const float32x2_t vblbr1 = vld1_f32(ibl1);
+      const float32x2_t vtltr2 = vld1_f32(itl2);
+      const float32x2_t vblbr2 = vld1_f32(ibl2);
+      const float32x2_t vtltr3 = vld1_f32(itl3);
+      const float32x2_t vblbr3 = vld1_f32(ibl3);
+
+      const float32x4_t valphah = vw.val[0];
+      const float32x4_t valphav = vw.val[1];
+
+      const float32x4_t vtltr01 = vcombine_f32(vtltr0, vtltr1);
+      const float32x4_t vblbr01 = vcombine_f32(vblbr0, vblbr1);
+      const float32x4_t vtltr23 = vcombine_f32(vtltr2, vtltr3);
+      const float32x4_t vblbr23 = vcombine_f32(vblbr2, vblbr3);
+
+      const float32x4_t vldrd01 = vsubq_f32(vblbr01, vtltr01);
+      const float32x4_t vldrd23 = vsubq_f32(vblbr23, vtltr23);
+
+      const float32x4x2_t vld_t = vuzpq_f32(vldrd01, vldrd23);
+      const float32x4_t vld = vld_t.val[0];
+      const float32x4_t vrd = vld_t.val[1];
+
+      const float32x4x2_t vtl_t = vuzpq_f32(vtltr01, vtltr23);
+      const float32x4_t vtl = vtl_t.val[0];
+      const float32x4_t vtr = vtl_t.val[1];
+
+      const float32x4_t vl = vmlaq_f32(vtl, vld, valphav);
+      const float32x4_t vr = vmlaq_f32(vtr, vrd, valphav);
+
+      const float32x4_t vd = vsubq_f32(vr, vl);
+      const float32x4_t vo = vmlaq_f32(vl, vd, valphah);
+
+      vst1q_f32(output, vo);
+      output += 4;
+    }
+
+    if XNN_UNLIKELY(p != 0) {
+      if (p & 2) {
+        const float32x2x2_t vw = vld2_f32(w);
+        w += 4;
+
+        const float32x2_t valphah = vw.val[0];
+        const float32x2_t valphav = vw.val[1];
+
+        const float* itl0 = (const float*) ((uintptr_t) i[0] + input_offset);
+        const float* ibl0 = (const float*) ((uintptr_t) i[1] + input_offset);
+        const float* itl1 = (const float*) ((uintptr_t) i[2] + input_offset);
+        const float* ibl1 = (const float*) ((uintptr_t) i[3] + input_offset);
+        i += 4;
+
+        const float32x2_t vtltr0 = vld1_f32(itl0);
+        const float32x2_t vblbr0 = vld1_f32(ibl0);
+        const float32x2_t vtltr1 = vld1_f32(itl1);
+        const float32x2_t vblbr1 = vld1_f32(ibl1);
+
+        const float32x2_t vldrd0 = vsub_f32(vblbr0, vtltr0);
+        const float32x2_t vldrd1 = vsub_f32(vblbr1, vtltr1);
+
+        const float32x2x2_t vld_t = vuzp_f32(vldrd0, vldrd1);
+        const float32x2_t vld = vld_t.val[0];
+        const float32x2_t vrd = vld_t.val[1];
+
+        const float32x2x2_t vtl_t = vuzp_f32(vtltr0, vtltr1);
+        const float32x2_t vtl = vtl_t.val[0];
+        const float32x2_t vtr = vtl_t.val[1];
+
+        const float32x2_t vl = vmla_f32(vtl, vld, valphav);
+        const float32x2_t vr = vmla_f32(vtr, vrd, valphav);
+
+        const float32x2_t vd = vsub_f32(vr, vl);
+        const float32x2_t vo = vmla_f32(vl, vd, valphah);
+
+        vst1_f32(output, vo);
+        output += 2;
+      }
+
+      if (p & 1) {
+        // We are computing the following formula:
+        //   result = (1 - alpha_h) * (1 - alpha_v) * top_left +
+        //                 alpha_h  * (1 - alpha_v) * top_right +
+        //            (1 - alpha_h) *      alpha_v  * bottom_left +
+        //                 alpha_h  *      alpha_v  * bottom_right.
+        //
+        // Rearranging gives
+        //   result =    left + alpha_h * (right        - left),
+        // where
+        //   left =  top_left + alpha_v * (bottom_left  - top_left),
+        //  right = top_right + alpha_v * (bottom_right - top_right).
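+        //
+        // As an illustrative numeric check of the rearranged form (example values,
+        // not taken from any model): with alpha_h = 0.25, alpha_v = 0.5,
+        // top_left = 0, top_right = 4, bottom_left = 2, bottom_right = 6:
+        //   left   = 0 + 0.5  * (2 - 0) = 1
+        //   right  = 4 + 0.5  * (6 - 4) = 5
+        //   result = 1 + 0.25 * (5 - 1) = 2,
+        // which matches the direct four-term weighted sum above.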
+
+        const float alphah = *w;
+        const float32x2_t valphav = vld1_dup_f32(w + 1);
+        w += 2;
+
+        const float* itl = (const float*) ((uintptr_t) i[0] + input_offset);
+        const float* ibl = (const float*) ((uintptr_t) i[1] + input_offset);
+        i += 2;
+
+        const float32x2_t vtltr = vld1_f32(itl);
+        const float32x2_t vblbr = vld1_f32(ibl);
+
+        // Compute at once
+        //    left_diff = bottom_left  - top_left
+        //   right_diff = bottom_right - top_right
+        const float32x2_t vldrd = vsub_f32(vblbr, vtltr);
+        const float32x2_t vlr = vmla_f32(vtltr, vldrd, valphav);
+
+        // Extract them and compute the result.
+        const float l = vget_lane_f32(vlr, 0);
+        const float r = vget_lane_f32(vlr, 1);
+
+        *output++ = l + alphah * (r - l);
+      }
+    }
+
+    input_offset += input_increment;
+  } while (--channels != 0);
+}
diff --git a/src/f32-ibilinear-chw/gen/neon-p8.c b/src/f32-ibilinear-chw/gen/neon-p8.c
new file mode 100644
index 0000000..7e6ffad
--- /dev/null
+++ b/src/f32-ibilinear-chw/gen/neon-p8.c
@@ -0,0 +1,255 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-ibilinear-chw/neon.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/ibilinear.h>
+
+
+void xnn_f32_ibilinear_chw_ukernel__neon_p8(
+    size_t output_pixels,
+    size_t channels,
+    const float**restrict input,
+    size_t input_offset,
+    const float*restrict weights,
+    float*restrict output,
+    size_t input_increment) XNN_DISABLE_TSAN
+{
+  assert(output_pixels != 0);
+  assert(channels != 0);
+  assert(input_increment % sizeof(float) == 0);
+
+  do {
+    const float** i = input;
+    const float* w = weights;
+    size_t p = output_pixels;
+    for (; p >= 8; p -= 8) {
+      const float* itl0 = (const float*) ((uintptr_t) i[0] + input_offset);
+      const float* ibl0 = (const float*) ((uintptr_t) i[1] + input_offset);
+      const float* itl1 = (const float*) ((uintptr_t) i[2] + input_offset);
+      const float* ibl1 = (const float*) ((uintptr_t) i[3] + input_offset);
+      const float* itl2 = (const float*) ((uintptr_t) i[4] + input_offset);
+      const float* ibl2 = (const float*) ((uintptr_t) i[5] + input_offset);
+      const float* itl3 = (const float*) ((uintptr_t) i[6] + input_offset);
+      const float* ibl3 = (const float*) ((uintptr_t) i[7] + input_offset);
+      const float* itl4 = (const float*) ((uintptr_t) i[8] + input_offset);
+      const float* ibl4 = (const float*) ((uintptr_t) i[9] + input_offset);
+      const float* itl5 = (const float*) ((uintptr_t) i[10] + input_offset);
+      const float* ibl5 = (const float*) ((uintptr_t) i[11] + input_offset);
+      const float* itl6 = (const float*) ((uintptr_t) i[12] + input_offset);
+      const float* ibl6 = (const float*) ((uintptr_t) i[13] + input_offset);
+      const float* itl7 = (const float*) ((uintptr_t) i[14] + input_offset);
+      const float* ibl7 = (const float*) ((uintptr_t) i[15] + input_offset);
+      i += 2 * 8;
+
+      const float32x4x2_t vw0123 = vld2q_f32(w + 0);
+      const float32x4x2_t vw4567 = vld2q_f32(w + 8);
+      w += 2 * 8;
+
+      const float32x2_t vtltr0 = vld1_f32(itl0);
+      const float32x2_t vblbr0 = vld1_f32(ibl0);
+      const float32x2_t vtltr1 = vld1_f32(itl1);
+      const float32x2_t vblbr1 = vld1_f32(ibl1);
+      const float32x2_t vtltr2 = vld1_f32(itl2);
+      const float32x2_t vblbr2 = vld1_f32(ibl2);
+      const float32x2_t vtltr3 = vld1_f32(itl3);
+      const float32x2_t vblbr3 = vld1_f32(ibl3);
+      const float32x2_t vtltr4 = vld1_f32(itl4);
+      const float32x2_t vblbr4 = vld1_f32(ibl4);
+      const float32x2_t vtltr5 = vld1_f32(itl5);
+      const float32x2_t vblbr5 = vld1_f32(ibl5);
+      const float32x2_t vtltr6 = vld1_f32(itl6);
+      const float32x2_t vblbr6 = vld1_f32(ibl6);
+      const float32x2_t vtltr7 = vld1_f32(itl7);
+      const float32x2_t vblbr7 = vld1_f32(ibl7);
+
+      const float32x4_t valphah0123 = vw0123.val[0];
+      const float32x4_t valphav0123 = vw0123.val[1];
+      const float32x4_t valphah4567 = vw4567.val[0];
+      const float32x4_t valphav4567 = vw4567.val[1];
+
+      const float32x4_t vtltr01 = vcombine_f32(vtltr0, vtltr1);
+      const float32x4_t vblbr01 = vcombine_f32(vblbr0, vblbr1);
+      const float32x4_t vtltr23 = vcombine_f32(vtltr2, vtltr3);
+      const float32x4_t vblbr23 = vcombine_f32(vblbr2, vblbr3);
+      const float32x4_t vtltr45 = vcombine_f32(vtltr4, vtltr5);
+      const float32x4_t vblbr45 = vcombine_f32(vblbr4, vblbr5);
+      const float32x4_t vtltr67 = vcombine_f32(vtltr6, vtltr7);
+      const float32x4_t vblbr67 = vcombine_f32(vblbr6, vblbr7);
+
+      const float32x4_t vldrd01 = vsubq_f32(vblbr01, vtltr01);
+      const float32x4_t vldrd23 = vsubq_f32(vblbr23, vtltr23);
+      const float32x4_t vldrd45 = vsubq_f32(vblbr45, vtltr45);
+      const float32x4_t vldrd67 = vsubq_f32(vblbr67, vtltr67);
+
+      const float32x4x2_t vld_t0123 = vuzpq_f32(vldrd01, vldrd23);
+      const float32x4_t vld0123 = vld_t0123.val[0];
+      const float32x4_t vrd0123 = vld_t0123.val[1];
+      const float32x4x2_t vld_t4567 = vuzpq_f32(vldrd45, vldrd67);
+      const float32x4_t vld4567 = vld_t4567.val[0];
+      const float32x4_t vrd4567 = vld_t4567.val[1];
+
+      const float32x4x2_t vtl_t0123 = vuzpq_f32(vtltr01, vtltr23);
+      const float32x4_t vtl0123 = vtl_t0123.val[0];
+      const float32x4_t vtr0123 = vtl_t0123.val[1];
+      const float32x4x2_t vtl_t4567 = vuzpq_f32(vtltr45, vtltr67);
+      const float32x4_t vtl4567 = vtl_t4567.val[0];
+      const float32x4_t vtr4567 = vtl_t4567.val[1];
+
+      const float32x4_t vl0123 = vmlaq_f32(vtl0123, vld0123, valphav0123);
+      const float32x4_t vr0123 = vmlaq_f32(vtr0123, vrd0123, valphav0123);
+      const float32x4_t vl4567 = vmlaq_f32(vtl4567, vld4567, valphav4567);
+      const float32x4_t vr4567 = vmlaq_f32(vtr4567, vrd4567, valphav4567);
+
+      const float32x4_t vd0123 = vsubq_f32(vr0123, vl0123);
+      const float32x4_t vd4567 = vsubq_f32(vr4567, vl4567);
+
+      const float32x4_t vo0123 = vmlaq_f32(vl0123, vd0123, valphah0123);
+      const float32x4_t vo4567 = vmlaq_f32(vl4567, vd4567, valphah4567);
+
+      vst1q_f32(output + 0, vo0123);
+      vst1q_f32(output + 4, vo4567);
+      output += 8;
+    }
+
+    for (; p >= 4; p -= 4) {
+      const float* itl0 = (const float*) ((uintptr_t) i[0] + input_offset);
+      const float* ibl0 = (const float*) ((uintptr_t) i[1] + input_offset);
+      const float* itl1 = (const float*) ((uintptr_t) i[2] + input_offset);
+      const float* ibl1 = (const float*) ((uintptr_t) i[3] + input_offset);
+      const float* itl2 = (const float*) ((uintptr_t) i[4] + input_offset);
+      const float* ibl2 = (const float*) ((uintptr_t) i[5] + input_offset);
+      const float* itl3 = (const float*) ((uintptr_t) i[6] + input_offset);
+      const float* ibl3 = (const float*) ((uintptr_t) i[7] + input_offset);
+      i += 8;
+
+      const float32x4x2_t vw = vld2q_f32(w);
+      w += 8;
+
+      const float32x2_t vtltr0 = vld1_f32(itl0);
+      const float32x2_t vblbr0 = vld1_f32(ibl0);
+      const float32x2_t vtltr1 = vld1_f32(itl1);
+      const float32x2_t vblbr1 = vld1_f32(ibl1);
+      const float32x2_t vtltr2 = vld1_f32(itl2);
+      const float32x2_t vblbr2 = vld1_f32(ibl2);
+      const float32x2_t vtltr3 = vld1_f32(itl3);
+      const float32x2_t vblbr3 = vld1_f32(ibl3);
+
+      const float32x4_t valphah = vw.val[0];
+      const float32x4_t valphav = vw.val[1];
+
+      const float32x4_t vtltr01 = vcombine_f32(vtltr0, vtltr1);
+      const float32x4_t vblbr01 = vcombine_f32(vblbr0, vblbr1);
+      const float32x4_t vtltr23 = vcombine_f32(vtltr2, vtltr3);
+      const float32x4_t vblbr23 = vcombine_f32(vblbr2, vblbr3);
+
+      const float32x4_t vldrd01 = vsubq_f32(vblbr01, vtltr01);
+      const float32x4_t vldrd23 = vsubq_f32(vblbr23, vtltr23);
+
+      const float32x4x2_t vld_t = vuzpq_f32(vldrd01, vldrd23);
+      const float32x4_t vld = vld_t.val[0];
+      const float32x4_t vrd = vld_t.val[1];
+
+      const float32x4x2_t vtl_t = vuzpq_f32(vtltr01, vtltr23);
+      const float32x4_t vtl = vtl_t.val[0];
+      const float32x4_t vtr = vtl_t.val[1];
+
+      const float32x4_t vl = vmlaq_f32(vtl, vld, valphav);
+      const float32x4_t vr = vmlaq_f32(vtr, vrd, valphav);
+
+      const float32x4_t vd = vsubq_f32(vr, vl);
+      const float32x4_t vo = vmlaq_f32(vl, vd, valphah);
+
+      vst1q_f32(output, vo);
+      output += 4;
+    }
+
+    if XNN_UNLIKELY(p != 0) {
+      if (p & 2) {
+        const float32x2x2_t vw = vld2_f32(w);
+        w += 4;
+
+        const float32x2_t valphah = vw.val[0];
+        const float32x2_t valphav = vw.val[1];
+
+        const float* itl0 = (const float*) ((uintptr_t) i[0] + input_offset);
+        const float* ibl0 = (const float*) ((uintptr_t) i[1] + input_offset);
+        const float* itl1 = (const float*) ((uintptr_t) i[2] + input_offset);
+        const float* ibl1 = (const float*) ((uintptr_t) i[3] + input_offset);
+        i += 4;
+
+        const float32x2_t vtltr0 = vld1_f32(itl0);
+        const float32x2_t vblbr0 = vld1_f32(ibl0);
+        const float32x2_t vtltr1 = vld1_f32(itl1);
+        const float32x2_t vblbr1 = vld1_f32(ibl1);
+
+        const float32x2_t vldrd0 = vsub_f32(vblbr0, vtltr0);
+        const float32x2_t vldrd1 = vsub_f32(vblbr1, vtltr1);
+
+        const float32x2x2_t vld_t = vuzp_f32(vldrd0, vldrd1);
+        const float32x2_t vld = vld_t.val[0];
+        const float32x2_t vrd = vld_t.val[1];
+
+        const float32x2x2_t vtl_t = vuzp_f32(vtltr0, vtltr1);
+        const float32x2_t vtl = vtl_t.val[0];
+        const float32x2_t vtr = vtl_t.val[1];
+
+        const float32x2_t vl = vmla_f32(vtl, vld, valphav);
+        const float32x2_t vr = vmla_f32(vtr, vrd, valphav);
+
+        const float32x2_t vd = vsub_f32(vr, vl);
+        const float32x2_t vo = vmla_f32(vl, vd, valphah);
+
+        vst1_f32(output, vo);
+        output += 2;
+      }
+
+      if (p & 1) {
+        // We are computing the following formula:
+        //   result = (1 - alpha_h) * (1 - alpha_v) * top_left +
+        //                 alpha_h  * (1 - alpha_v) * top_right +
+        //            (1 - alpha_h) *      alpha_v  * bottom_left +
+        //                 alpha_h  *      alpha_v  * bottom_right.
+        //
+        // Rearranging gives
+        //   result =    left + alpha_h * (right        - left),
+        // where
+        //   left =  top_left + alpha_v * (bottom_left  - top_left),
+        //  right = top_right + alpha_v * (bottom_right - top_right).
+
+        const float alphah = *w;
+        const float32x2_t valphav = vld1_dup_f32(w + 1);
+        w += 2;
+
+        const float* itl = (const float*) ((uintptr_t) i[0] + input_offset);
+        const float* ibl = (const float*) ((uintptr_t) i[1] + input_offset);
+        i += 2;
+
+        const float32x2_t vtltr = vld1_f32(itl);
+        const float32x2_t vblbr = vld1_f32(ibl);
+
+        // Compute at once
+        //    left_diff = bottom_left  - top_left
+        //   right_diff = bottom_right - top_right
+        const float32x2_t vldrd = vsub_f32(vblbr, vtltr);
+        const float32x2_t vlr = vmla_f32(vtltr, vldrd, valphav);
+
+        // Extract them and compute the result.
+        const float l = vget_lane_f32(vlr, 0);
+        const float r = vget_lane_f32(vlr, 1);
+
+        *output++ = l + alphah * (r - l);
+      }
+    }
+
+    input_offset += input_increment;
+  } while (--channels != 0);
+}
diff --git a/src/f32-ibilinear-chw/gen/neonfma-p4.c b/src/f32-ibilinear-chw/gen/neonfma-p4.c
new file mode 100644
index 0000000..0675165
--- /dev/null
+++ b/src/f32-ibilinear-chw/gen/neonfma-p4.c
@@ -0,0 +1,167 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-ibilinear-chw/neon.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/ibilinear.h>
+
+
+void xnn_f32_ibilinear_chw_ukernel__neonfma_p4(
+    size_t output_pixels,
+    size_t channels,
+    const float**restrict input,
+    size_t input_offset,
+    const float*restrict weights,
+    float*restrict output,
+    size_t input_increment) XNN_DISABLE_TSAN
+{
+  assert(output_pixels != 0);
+  assert(channels != 0);
+  assert(input_increment % sizeof(float) == 0);
+
+  do {
+    const float** i = input;
+    const float* w = weights;
+    size_t p = output_pixels;
+
+    for (; p >= 4; p -= 4) {
+      const float* itl0 = (const float*) ((uintptr_t) i[0] + input_offset);
+      const float* ibl0 = (const float*) ((uintptr_t) i[1] + input_offset);
+      const float* itl1 = (const float*) ((uintptr_t) i[2] + input_offset);
+      const float* ibl1 = (const float*) ((uintptr_t) i[3] + input_offset);
+      const float* itl2 = (const float*) ((uintptr_t) i[4] + input_offset);
+      const float* ibl2 = (const float*) ((uintptr_t) i[5] + input_offset);
+      const float* itl3 = (const float*) ((uintptr_t) i[6] + input_offset);
+      const float* ibl3 = (const float*) ((uintptr_t) i[7] + input_offset);
+      i += 8;
+
+      const float32x4x2_t vw = vld2q_f32(w);
+      w += 8;
+
+      const float32x2_t vtltr0 = vld1_f32(itl0);
+      const float32x2_t vblbr0 = vld1_f32(ibl0);
+      const float32x2_t vtltr1 = vld1_f32(itl1);
+      const float32x2_t vblbr1 = vld1_f32(ibl1);
+      const float32x2_t vtltr2 = vld1_f32(itl2);
+      const float32x2_t vblbr2 = vld1_f32(ibl2);
+      const float32x2_t vtltr3 = vld1_f32(itl3);
+      const float32x2_t vblbr3 = vld1_f32(ibl3);
+
+      const float32x4_t valphah = vw.val[0];
+      const float32x4_t valphav = vw.val[1];
+
+      const float32x4_t vtltr01 = vcombine_f32(vtltr0, vtltr1);
+      const float32x4_t vblbr01 = vcombine_f32(vblbr0, vblbr1);
+      const float32x4_t vtltr23 = vcombine_f32(vtltr2, vtltr3);
+      const float32x4_t vblbr23 = vcombine_f32(vblbr2, vblbr3);
+
+      const float32x4_t vldrd01 = vsubq_f32(vblbr01, vtltr01);
+      const float32x4_t vldrd23 = vsubq_f32(vblbr23, vtltr23);
+
+      const float32x4x2_t vld_t = vuzpq_f32(vldrd01, vldrd23);
+      const float32x4_t vld = vld_t.val[0];
+      const float32x4_t vrd = vld_t.val[1];
+
+      const float32x4x2_t vtl_t = vuzpq_f32(vtltr01, vtltr23);
+      const float32x4_t vtl = vtl_t.val[0];
+      const float32x4_t vtr = vtl_t.val[1];
+
+      const float32x4_t vl = vfmaq_f32(vtl, vld, valphav);
+      const float32x4_t vr = vfmaq_f32(vtr, vrd, valphav);
+
+      const float32x4_t vd = vsubq_f32(vr, vl);
+      const float32x4_t vo = vfmaq_f32(vl, vd, valphah);
+
+      vst1q_f32(output, vo);
+      output += 4;
+    }
+
+    if XNN_UNLIKELY(p != 0) {
+      if (p & 2) {
+        const float32x2x2_t vw = vld2_f32(w);
+        w += 4;
+
+        const float32x2_t valphah = vw.val[0];
+        const float32x2_t valphav = vw.val[1];
+
+        const float* itl0 = (const float*) ((uintptr_t) i[0] + input_offset);
+        const float* ibl0 = (const float*) ((uintptr_t) i[1] + input_offset);
+        const float* itl1 = (const float*) ((uintptr_t) i[2] + input_offset);
+        const float* ibl1 = (const float*) ((uintptr_t) i[3] + input_offset);
+        i += 4;
+
+        const float32x2_t vtltr0 = vld1_f32(itl0);
+        const float32x2_t vblbr0 = vld1_f32(ibl0);
+        const float32x2_t vtltr1 = vld1_f32(itl1);
+        const float32x2_t vblbr1 = vld1_f32(ibl1);
+
+        const float32x2_t vldrd0 = vsub_f32(vblbr0, vtltr0);
+        const float32x2_t vldrd1 = vsub_f32(vblbr1, vtltr1);
+
+        const float32x2x2_t vld_t = vuzp_f32(vldrd0, vldrd1);
+        const float32x2_t vld = vld_t.val[0];
+        const float32x2_t vrd = vld_t.val[1];
+
+        const float32x2x2_t vtl_t = vuzp_f32(vtltr0, vtltr1);
+        const float32x2_t vtl = vtl_t.val[0];
+        const float32x2_t vtr = vtl_t.val[1];
+
+        const float32x2_t vl = vfma_f32(vtl, vld, valphav);
+        const float32x2_t vr = vfma_f32(vtr, vrd, valphav);
+
+        const float32x2_t vd = vsub_f32(vr, vl);
+        const float32x2_t vo = vfma_f32(vl, vd, valphah);
+
+        vst1_f32(output, vo);
+        output += 2;
+      }
+
+      if (p & 1) {
+        // We are computing the following formula:
+        //   result = (1 - alpha_h) * (1 - alpha_v) * top_left +
+        //                 alpha_h  * (1 - alpha_v) * top_right +
+        //            (1 - alpha_h) *      alpha_v  * bottom_left +
+        //                 alpha_h  *      alpha_v  * bottom_right.
+        //
+        // Rearranging gives
+        //   result =    left + alpha_h * (right        - left),
+        // where
+        //   left =  top_left + alpha_v * (bottom_left  - top_left),
+        //  right = top_right + alpha_v * (bottom_right - top_right).
+
+        const float alphah = *w;
+        const float32x2_t valphav = vld1_dup_f32(w + 1);
+        w += 2;
+
+        const float* itl = (const float*) ((uintptr_t) i[0] + input_offset);
+        const float* ibl = (const float*) ((uintptr_t) i[1] + input_offset);
+        i += 2;
+
+        const float32x2_t vtltr = vld1_f32(itl);
+        const float32x2_t vblbr = vld1_f32(ibl);
+
+        // Compute at once
+        //    left_diff = bottom_left  - top_left
+        //   right_diff = bottom_right - top_right
+        const float32x2_t vldrd = vsub_f32(vblbr, vtltr);
+        const float32x2_t vlr = vfma_f32(vtltr, vldrd, valphav);
+
+        // Extract them and compute the result.
+        const float l = vget_lane_f32(vlr, 0);
+        const float r = vget_lane_f32(vlr, 1);
+
+        *output++ = l + alphah * (r - l);
+      }
+    }
+
+    input_offset += input_increment;
+  } while (--channels != 0);
+}
diff --git a/src/f32-ibilinear-chw/gen/neonfma-p8.c b/src/f32-ibilinear-chw/gen/neonfma-p8.c
new file mode 100644
index 0000000..4fb499b
--- /dev/null
+++ b/src/f32-ibilinear-chw/gen/neonfma-p8.c
@@ -0,0 +1,255 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-ibilinear-chw/neon.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/ibilinear.h>
+
+
+void xnn_f32_ibilinear_chw_ukernel__neonfma_p8(
+    size_t output_pixels,
+    size_t channels,
+    const float**restrict input,
+    size_t input_offset,
+    const float*restrict weights,
+    float*restrict output,
+    size_t input_increment) XNN_DISABLE_TSAN
+{
+  assert(output_pixels != 0);
+  assert(channels != 0);
+  assert(input_increment % sizeof(float) == 0);
+
+  do {
+    const float** i = input;
+    const float* w = weights;
+    size_t p = output_pixels;
+    for (; p >= 8; p -= 8) {
+      const float* itl0 = (const float*) ((uintptr_t) i[0] + input_offset);
+      const float* ibl0 = (const float*) ((uintptr_t) i[1] + input_offset);
+      const float* itl1 = (const float*) ((uintptr_t) i[2] + input_offset);
+      const float* ibl1 = (const float*) ((uintptr_t) i[3] + input_offset);
+      const float* itl2 = (const float*) ((uintptr_t) i[4] + input_offset);
+      const float* ibl2 = (const float*) ((uintptr_t) i[5] + input_offset);
+      const float* itl3 = (const float*) ((uintptr_t) i[6] + input_offset);
+      const float* ibl3 = (const float*) ((uintptr_t) i[7] + input_offset);
+      const float* itl4 = (const float*) ((uintptr_t) i[8] + input_offset);
+      const float* ibl4 = (const float*) ((uintptr_t) i[9] + input_offset);
+      const float* itl5 = (const float*) ((uintptr_t) i[10] + input_offset);
+      const float* ibl5 = (const float*) ((uintptr_t) i[11] + input_offset);
+      const float* itl6 = (const float*) ((uintptr_t) i[12] + input_offset);
+      const float* ibl6 = (const float*) ((uintptr_t) i[13] + input_offset);
+      const float* itl7 = (const float*) ((uintptr_t) i[14] + input_offset);
+      const float* ibl7 = (const float*) ((uintptr_t) i[15] + input_offset);
+      i += 2 * 8;
+
+      const float32x4x2_t vw0123 = vld2q_f32(w + 0);
+      const float32x4x2_t vw4567 = vld2q_f32(w + 8);
+      w += 2 * 8;
+
+      const float32x2_t vtltr0 = vld1_f32(itl0);
+      const float32x2_t vblbr0 = vld1_f32(ibl0);
+      const float32x2_t vtltr1 = vld1_f32(itl1);
+      const float32x2_t vblbr1 = vld1_f32(ibl1);
+      const float32x2_t vtltr2 = vld1_f32(itl2);
+      const float32x2_t vblbr2 = vld1_f32(ibl2);
+      const float32x2_t vtltr3 = vld1_f32(itl3);
+      const float32x2_t vblbr3 = vld1_f32(ibl3);
+      const float32x2_t vtltr4 = vld1_f32(itl4);
+      const float32x2_t vblbr4 = vld1_f32(ibl4);
+      const float32x2_t vtltr5 = vld1_f32(itl5);
+      const float32x2_t vblbr5 = vld1_f32(ibl5);
+      const float32x2_t vtltr6 = vld1_f32(itl6);
+      const float32x2_t vblbr6 = vld1_f32(ibl6);
+      const float32x2_t vtltr7 = vld1_f32(itl7);
+      const float32x2_t vblbr7 = vld1_f32(ibl7);
+
+      const float32x4_t valphah0123 = vw0123.val[0];
+      const float32x4_t valphav0123 = vw0123.val[1];
+      const float32x4_t valphah4567 = vw4567.val[0];
+      const float32x4_t valphav4567 = vw4567.val[1];
+
+      const float32x4_t vtltr01 = vcombine_f32(vtltr0, vtltr1);
+      const float32x4_t vblbr01 = vcombine_f32(vblbr0, vblbr1);
+      const float32x4_t vtltr23 = vcombine_f32(vtltr2, vtltr3);
+      const float32x4_t vblbr23 = vcombine_f32(vblbr2, vblbr3);
+      const float32x4_t vtltr45 = vcombine_f32(vtltr4, vtltr5);
+      const float32x4_t vblbr45 = vcombine_f32(vblbr4, vblbr5);
+      const float32x4_t vtltr67 = vcombine_f32(vtltr6, vtltr7);
+      const float32x4_t vblbr67 = vcombine_f32(vblbr6, vblbr7);
+
+      const float32x4_t vldrd01 = vsubq_f32(vblbr01, vtltr01);
+      const float32x4_t vldrd23 = vsubq_f32(vblbr23, vtltr23);
+      const float32x4_t vldrd45 = vsubq_f32(vblbr45, vtltr45);
+      const float32x4_t vldrd67 = vsubq_f32(vblbr67, vtltr67);
+
+      const float32x4x2_t vld_t0123 = vuzpq_f32(vldrd01, vldrd23);
+      const float32x4_t vld0123 = vld_t0123.val[0];
+      const float32x4_t vrd0123 = vld_t0123.val[1];
+      const float32x4x2_t vld_t4567 = vuzpq_f32(vldrd45, vldrd67);
+      const float32x4_t vld4567 = vld_t4567.val[0];
+      const float32x4_t vrd4567 = vld_t4567.val[1];
+
+      const float32x4x2_t vtl_t0123 = vuzpq_f32(vtltr01, vtltr23);
+      const float32x4_t vtl0123 = vtl_t0123.val[0];
+      const float32x4_t vtr0123 = vtl_t0123.val[1];
+      const float32x4x2_t vtl_t4567 = vuzpq_f32(vtltr45, vtltr67);
+      const float32x4_t vtl4567 = vtl_t4567.val[0];
+      const float32x4_t vtr4567 = vtl_t4567.val[1];
+
+      const float32x4_t vl0123 = vfmaq_f32(vtl0123, vld0123, valphav0123);
+      const float32x4_t vr0123 = vfmaq_f32(vtr0123, vrd0123, valphav0123);
+      const float32x4_t vl4567 = vfmaq_f32(vtl4567, vld4567, valphav4567);
+      const float32x4_t vr4567 = vfmaq_f32(vtr4567, vrd4567, valphav4567);
+
+      const float32x4_t vd0123 = vsubq_f32(vr0123, vl0123);
+      const float32x4_t vd4567 = vsubq_f32(vr4567, vl4567);
+
+      const float32x4_t vo0123 = vfmaq_f32(vl0123, vd0123, valphah0123);
+      const float32x4_t vo4567 = vfmaq_f32(vl4567, vd4567, valphah4567);
+
+      vst1q_f32(output + 0, vo0123);
+      vst1q_f32(output + 4, vo4567);
+      output += 8;
+    }
+
+    for (; p >= 4; p -= 4) {
+      const float* itl0 = (const float*) ((uintptr_t) i[0] + input_offset);
+      const float* ibl0 = (const float*) ((uintptr_t) i[1] + input_offset);
+      const float* itl1 = (const float*) ((uintptr_t) i[2] + input_offset);
+      const float* ibl1 = (const float*) ((uintptr_t) i[3] + input_offset);
+      const float* itl2 = (const float*) ((uintptr_t) i[4] + input_offset);
+      const float* ibl2 = (const float*) ((uintptr_t) i[5] + input_offset);
+      const float* itl3 = (const float*) ((uintptr_t) i[6] + input_offset);
+      const float* ibl3 = (const float*) ((uintptr_t) i[7] + input_offset);
+      i += 8;
+
+      const float32x4x2_t vw = vld2q_f32(w);
+      w += 8;
+
+      const float32x2_t vtltr0 = vld1_f32(itl0);
+      const float32x2_t vblbr0 = vld1_f32(ibl0);
+      const float32x2_t vtltr1 = vld1_f32(itl1);
+      const float32x2_t vblbr1 = vld1_f32(ibl1);
+      const float32x2_t vtltr2 = vld1_f32(itl2);
+      const float32x2_t vblbr2 = vld1_f32(ibl2);
+      const float32x2_t vtltr3 = vld1_f32(itl3);
+      const float32x2_t vblbr3 = vld1_f32(ibl3);
+
+      const float32x4_t valphah = vw.val[0];
+      const float32x4_t valphav = vw.val[1];
+
+      const float32x4_t vtltr01 = vcombine_f32(vtltr0, vtltr1);
+      const float32x4_t vblbr01 = vcombine_f32(vblbr0, vblbr1);
+      const float32x4_t vtltr23 = vcombine_f32(vtltr2, vtltr3);
+      const float32x4_t vblbr23 = vcombine_f32(vblbr2, vblbr3);
+
+      const float32x4_t vldrd01 = vsubq_f32(vblbr01, vtltr01);
+      const float32x4_t vldrd23 = vsubq_f32(vblbr23, vtltr23);
+
+      const float32x4x2_t vld_t = vuzpq_f32(vldrd01, vldrd23);
+      const float32x4_t vld = vld_t.val[0];
+      const float32x4_t vrd = vld_t.val[1];
+
+      const float32x4x2_t vtl_t = vuzpq_f32(vtltr01, vtltr23);
+      const float32x4_t vtl = vtl_t.val[0];
+      const float32x4_t vtr = vtl_t.val[1];
+
+      const float32x4_t vl = vfmaq_f32(vtl, vld, valphav);
+      const float32x4_t vr = vfmaq_f32(vtr, vrd, valphav);
+
+      const float32x4_t vd = vsubq_f32(vr, vl);
+      const float32x4_t vo = vfmaq_f32(vl, vd, valphah);
+
+      vst1q_f32(output, vo);
+      output += 4;
+    }
+
+    if XNN_UNLIKELY(p != 0) {
+      if (p & 2) {
+        const float32x2x2_t vw = vld2_f32(w);
+        w += 4;
+
+        const float32x2_t valphah = vw.val[0];
+        const float32x2_t valphav = vw.val[1];
+
+        const float* itl0 = (const float*) ((uintptr_t) i[0] + input_offset);
+        const float* ibl0 = (const float*) ((uintptr_t) i[1] + input_offset);
+        const float* itl1 = (const float*) ((uintptr_t) i[2] + input_offset);
+        const float* ibl1 = (const float*) ((uintptr_t) i[3] + input_offset);
+        i += 4;
+
+        const float32x2_t vtltr0 = vld1_f32(itl0);
+        const float32x2_t vblbr0 = vld1_f32(ibl0);
+        const float32x2_t vtltr1 = vld1_f32(itl1);
+        const float32x2_t vblbr1 = vld1_f32(ibl1);
+
+        const float32x2_t vldrd0 = vsub_f32(vblbr0, vtltr0);
+        const float32x2_t vldrd1 = vsub_f32(vblbr1, vtltr1);
+
+        const float32x2x2_t vld_t = vuzp_f32(vldrd0, vldrd1);
+        const float32x2_t vld = vld_t.val[0];
+        const float32x2_t vrd = vld_t.val[1];
+
+        const float32x2x2_t vtl_t = vuzp_f32(vtltr0, vtltr1);
+        const float32x2_t vtl = vtl_t.val[0];
+        const float32x2_t vtr = vtl_t.val[1];
+
+        const float32x2_t vl = vfma_f32(vtl, vld, valphav);
+        const float32x2_t vr = vfma_f32(vtr, vrd, valphav);
+
+        const float32x2_t vd = vsub_f32(vr, vl);
+        const float32x2_t vo = vfma_f32(vl, vd, valphah);
+
+        vst1_f32(output, vo);
+        output += 2;
+      }
+
+      if (p & 1) {
+        // We are computing the following formula:
+        //   result = (1 - alpha_h) * (1 - alpha_v) * top_left +
+        //                 alpha_h  * (1 - alpha_v) * top_right +
+        //            (1 - alpha_h) *      alpha_v  * bottom_left +
+        //                 alpha_h  *      alpha_v  * bottom_right.
+        //
+        // Rearranging gives
+        //   result =    left + alpha_h * (right        - left),
+        // where
+        //   left =  top_left + alpha_v * (bottom_left  - top_left),
+        //  right = top_right + alpha_v * (bottom_right - top_right).
+
+        const float alphah = *w;
+        const float32x2_t valphav = vld1_dup_f32(w + 1);
+        w += 2;
+
+        const float* itl = (const float*) ((uintptr_t) i[0] + input_offset);
+        const float* ibl = (const float*) ((uintptr_t) i[1] + input_offset);
+        i += 2;
+
+        const float32x2_t vtltr = vld1_f32(itl);
+        const float32x2_t vblbr = vld1_f32(ibl);
+
+        // Compute at once
+        //    left_diff = bottom_left  - top_left
+        //   right_diff = bottom_right - top_right
+        const float32x2_t vldrd = vsub_f32(vblbr, vtltr);
+        const float32x2_t vlr = vfma_f32(vtltr, vldrd, valphav);
+
+        // Extract them and compute the result.
+        const float l = vget_lane_f32(vlr, 0);
+        const float r = vget_lane_f32(vlr, 1);
+
+        *output++ = l + alphah * (r - l);
+      }
+    }
+
+    input_offset += input_increment;
+  } while (--channels != 0);
+}
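
For reference, the rearrangement described in the kernel comments above computes, per output pixel (a scalar sketch with illustrative names, not part of the kernel):

    // Scalar sketch of the bilinear interpolation the NEON kernel vectorizes.
    static float ibilinear_pixel(float top_left, float top_right,
                                 float bottom_left, float bottom_right,
                                 float alpha_h, float alpha_v) {
      // Vertical lerp along the left and right edges.
      const float left  = top_left  + alpha_v * (bottom_left  - top_left);
      const float right = top_right + alpha_v * (bottom_right - top_right);
      // Horizontal lerp between the two edge results.
      return left + alpha_h * (right - left);
    }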
diff --git a/src/f32-ibilinear-chw/neon.c.in b/src/f32-ibilinear-chw/neon.c.in
new file mode 100644
index 0000000..8760b7d
--- /dev/null
+++ b/src/f32-ibilinear-chw/neon.c.in
@@ -0,0 +1,205 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert PIXEL_TILE >= 1
+$assert PIXEL_TILE % 4 == 0
+$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+$VMULADDQ_F32 = "vfmaq_f32" if FMA else "vmlaq_f32"
+$VMULADD_F32 = "vfma_f32" if FMA else "vmla_f32"
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/ibilinear.h>
+
+
+void xnn_f32_ibilinear_chw_ukernel__${"neonfma" if FMA else "neon"}_p${PIXEL_TILE}(
+    size_t output_pixels,
+    size_t channels,
+    const float**restrict input,
+    size_t input_offset,
+    const float*restrict weights,
+    float*restrict output,
+    size_t input_increment) XNN_DISABLE_TSAN
+{
+  assert(output_pixels != 0);
+  assert(channels != 0);
+  assert(input_increment % sizeof(float) == 0);
+
+  do {
+    const float** i = input;
+    const float* w = weights;
+    size_t p = output_pixels;
+    $if PIXEL_TILE > 4:
+      for (; p >= ${PIXEL_TILE}; p -= ${PIXEL_TILE}) {
+        $for P in range(PIXEL_TILE):
+          const float* itl${ABC[P]} = (const float*) ((uintptr_t) i[${2 * P}] + input_offset);
+          const float* ibl${ABC[P]} = (const float*) ((uintptr_t) i[${2 * P + 1}] + input_offset);
+        i += 2 * ${PIXEL_TILE};
+
+        $for P in range(0, PIXEL_TILE, 4):
+          const float32x4x2_t vw${ABC[P:P+4]} = vld2q_f32(w + ${2 * P});
+        w += 2 * ${PIXEL_TILE};
+
+        $for P in range(0, PIXEL_TILE):
+          const float32x2_t vtltr${ABC[P]} = vld1_f32(itl${P});
+          const float32x2_t vblbr${ABC[P]} = vld1_f32(ibl${P});
+
+        $for P in range(0, PIXEL_TILE, 4):
+          const float32x4_t valphah${ABC[P:P+4]} = vw${ABC[P:P+4]}.val[0];
+          const float32x4_t valphav${ABC[P:P+4]} = vw${ABC[P:P+4]}.val[1];
+
+        $for P in range(0, PIXEL_TILE, 2):
+          const float32x4_t vtltr${ABC[P:P+2]} = vcombine_f32(vtltr${ABC[P]}, vtltr${ABC[P+1]});
+          const float32x4_t vblbr${ABC[P:P+2]} = vcombine_f32(vblbr${ABC[P]}, vblbr${ABC[P+1]});
+
+        $for P in range(0, PIXEL_TILE, 2):
+          const float32x4_t vldrd${ABC[P:P+2]} = vsubq_f32(vblbr${ABC[P:P+2]}, vtltr${ABC[P:P+2]});
+
+        $for P in range(0, PIXEL_TILE, 4):
+          const float32x4x2_t vld_t${ABC[P:P+4]} = vuzpq_f32(vldrd${ABC[P:P+2]}, vldrd${ABC[P+2:P+4]});
+          const float32x4_t vld${ABC[P:P+4]} = vld_t${ABC[P:P+4]}.val[0];
+          const float32x4_t vrd${ABC[P:P+4]} = vld_t${ABC[P:P+4]}.val[1];
+
+        $for P in range(0, PIXEL_TILE, 4):
+          const float32x4x2_t vtl_t${ABC[P:P+4]} = vuzpq_f32(vtltr${ABC[P:P+2]}, vtltr${ABC[P+2:P+4]});
+          const float32x4_t vtl${ABC[P:P+4]} = vtl_t${ABC[P:P+4]}.val[0];
+          const float32x4_t vtr${ABC[P:P+4]} = vtl_t${ABC[P:P+4]}.val[1];
+
+        $for P in range(0, PIXEL_TILE, 4):
+          const float32x4_t vl${ABC[P:P+4]} = ${VMULADDQ_F32}(vtl${ABC[P:P+4]}, vld${ABC[P:P+4]}, valphav${ABC[P:P+4]});
+          const float32x4_t vr${ABC[P:P+4]} = ${VMULADDQ_F32}(vtr${ABC[P:P+4]}, vrd${ABC[P:P+4]}, valphav${ABC[P:P+4]});
+
+        $for P in range(0, PIXEL_TILE, 4):
+          const float32x4_t vd${ABC[P:P+4]} = vsubq_f32(vr${ABC[P:P+4]}, vl${ABC[P:P+4]});
+
+        $for P in range(0, PIXEL_TILE, 4):
+          const float32x4_t vo${ABC[P:P+4]} = ${VMULADDQ_F32}(vl${ABC[P:P+4]}, vd${ABC[P:P+4]}, valphah${ABC[P:P+4]});
+
+        $for P in range(0, PIXEL_TILE, 4):
+          vst1q_f32(output + ${P}, vo${ABC[P:P+4]});
+        output += ${PIXEL_TILE};
+      }
+
+    for (; p >= 4; p -= 4) {
+      $for P in range(4):
+        const float* itl${P} = (const float*) ((uintptr_t) i[${2 * P}] + input_offset);
+        const float* ibl${P} = (const float*) ((uintptr_t) i[${2 * P + 1}] + input_offset);
+      i += 8;
+
+      const float32x4x2_t vw = vld2q_f32(w);
+      w += 8;
+
+      $for P in range(0, 4):
+        const float32x2_t vtltr${ABC[P]} = vld1_f32(itl${P});
+        const float32x2_t vblbr${ABC[P]} = vld1_f32(ibl${P});
+
+      const float32x4_t valphah = vw.val[0];
+      const float32x4_t valphav = vw.val[1];
+
+      $for P in range(0, 4, 2):
+        const float32x4_t vtltr${ABC[P:P+2]} = vcombine_f32(vtltr${ABC[P]}, vtltr${ABC[P+1]});
+        const float32x4_t vblbr${ABC[P:P+2]} = vcombine_f32(vblbr${ABC[P]}, vblbr${ABC[P+1]});
+
+      $for P in range(0, 4, 2):
+        const float32x4_t vldrd${ABC[P:P+2]} = vsubq_f32(vblbr${ABC[P:P+2]}, vtltr${ABC[P:P+2]});
+
+      const float32x4x2_t vld_t = vuzpq_f32(vldrd01, vldrd23);
+      const float32x4_t vld = vld_t.val[0];
+      const float32x4_t vrd = vld_t.val[1];
+
+      const float32x4x2_t vtl_t = vuzpq_f32(vtltr01, vtltr23);
+      const float32x4_t vtl = vtl_t.val[0];
+      const float32x4_t vtr = vtl_t.val[1];
+
+      const float32x4_t vl = ${VMULADDQ_F32}(vtl, vld, valphav);
+      const float32x4_t vr = ${VMULADDQ_F32}(vtr, vrd, valphav);
+
+      const float32x4_t vd = vsubq_f32(vr, vl);
+      const float32x4_t vo = ${VMULADDQ_F32}(vl, vd, valphah);
+
+      vst1q_f32(output, vo);
+      output += 4;
+    }
+
+    if XNN_UNLIKELY(p != 0) {
+      if (p & 2) {
+        const float32x2x2_t vw = vld2_f32(w);
+        w += 4;
+
+        const float32x2_t valphah = vw.val[0];
+        const float32x2_t valphav = vw.val[1];
+
+        $for P in range(2):
+          const float* itl${P} = (const float*) ((uintptr_t) i[${2 * P}] + input_offset);
+          const float* ibl${P} = (const float*) ((uintptr_t) i[${2 * P + 1}] + input_offset);
+        i += 4;
+
+        $for P in range(0, 2):
+          const float32x2_t vtltr${ABC[P]} = vld1_f32(itl${P});
+          const float32x2_t vblbr${ABC[P]} = vld1_f32(ibl${P});
+
+        $for P in range(0, 2):
+          const float32x2_t vldrd${ABC[P]} = vsub_f32(vblbr${ABC[P]}, vtltr${ABC[P]});
+
+        const float32x2x2_t vld_t = vuzp_f32(vldrd0, vldrd1);
+        const float32x2_t vld = vld_t.val[0];
+        const float32x2_t vrd = vld_t.val[1];
+
+        const float32x2x2_t vtl_t = vuzp_f32(vtltr0, vtltr1);
+        const float32x2_t vtl = vtl_t.val[0];
+        const float32x2_t vtr = vtl_t.val[1];
+
+        const float32x2_t vl = ${VMULADD_F32}(vtl, vld, valphav);
+        const float32x2_t vr = ${VMULADD_F32}(vtr, vrd, valphav);
+
+        const float32x2_t vd = vsub_f32(vr, vl);
+        const float32x2_t vo = ${VMULADD_F32}(vl, vd, valphah);
+
+        vst1_f32(output, vo);
+        output += 2;
+      }
+
+      if (p & 1) {
+        // We are computing the following formula:
+        //   result = (1 - alpha_h) * (1 - alpha_v) * top_left +
+        //                 alpha_h  * (1 - alpha_v) * top_right +
+        //            (1 - alpha_h) *      alpha_v  * bottom_left +
+        //                 alpha_h  *      alpha_v  * bottom_right.
+        //
+        // Rearranging gives
+        //   result =    left + alpha_h * (right        - left),
+        // where
+        //   left =  top_left + alpha_v * (bottom_left  - top_left),
+        //  right = top_right + alpha_v * (bottom_right - top_right).
+
+        const float alphah = *w;
+        const float32x2_t valphav = vld1_dup_f32(w + 1);
+        w += 2;
+
+        const float* itl = (const float*) ((uintptr_t) i[0] + input_offset);
+        const float* ibl = (const float*) ((uintptr_t) i[1] + input_offset);
+        i += 2;
+
+        const float32x2_t vtltr = vld1_f32(itl);
+        const float32x2_t vblbr = vld1_f32(ibl);
+
+        // Compute at once
+        //    left_diff = bottom_left  - top_left
+        //   right_diff = bottom_right - top_right
+        const float32x2_t vldrd = vsub_f32(vblbr, vtltr);
+        const float32x2_t vlr = ${VMULADD_F32}(vtltr, vldrd, valphav);
+
+        // Extract them and compute the result.
+        const float l = vget_lane_f32(vlr, 0);
+        const float r = vget_lane_f32(vlr, 1);
+
+        *output++ = l + alphah * (r - l);
+      }
+    }
+
+    input_offset += input_increment;
+  } while (--channels != 0);
+}
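
Both the generated kernel and the template above rely on the same de-interleave trick: each pixel contributes a {top_left, top_right} pair, two pairs are packed into one 128-bit register, and vuzpq_f32 separates them into an all-top-left vector and an all-top-right vector. A minimal sketch of that step (illustrative, not kernel code):

    #include <arm_neon.h>

    // tltr = { tl0, tr0, tl1, tr1, tl2, tr2, tl3, tr3 }
    static void split_tl_tr(const float tltr[8], float32x4_t* tl, float32x4_t* tr) {
      const float32x4_t vtltr01 = vld1q_f32(tltr);      // tl0 tr0 tl1 tr1
      const float32x4_t vtltr23 = vld1q_f32(tltr + 4);  // tl2 tr2 tl3 tr3
      const float32x4x2_t vuzp = vuzpq_f32(vtltr01, vtltr23);
      *tl = vuzp.val[0];  // tl0 tl1 tl2 tl3
      *tr = vuzp.val[1];  // tr0 tr1 tr2 tr3
    }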
diff --git a/src/init.c b/src/init.c
index e741e5f..587bdaf 100644
--- a/src/init.c
+++ b/src/init.c
@@ -104,12 +104,23 @@
     #ifndef XNN_NO_QS8_OPERATORS
       init_flags |= XNN_INIT_FLAG_QS8;
 
-      xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_2x8__neon_mlal_lane);
-      xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_2x8__neon_mlal_lane);
-      xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x8__neon_mlal_lane);
-      xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x8__neon_mlal_lane);
-      xnn_params.qs8.gemm.mr = 2;
-      xnn_params.qs8.gemm.nr = 8;
+      if (cpuinfo_has_arm_neon_dot()) {
+        xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_4x8c4__neondot);
+        xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x8c4__neondot);
+        xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_4x8c4__neondot);
+        xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x8c4__neondot);
+        xnn_params.qs8.gemm.mr = 4;
+        xnn_params.qs8.gemm.nr = 8;
+        xnn_params.qs8.gemm.log2_kr = 2;
+      } else {
+        xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+        xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+        xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+        xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+        xnn_params.qs8.gemm.mr = 2;
+        xnn_params.qs8.gemm.nr = 8;
+        xnn_params.qs8.gemm.log2_kr = 1;
+      }
 
       xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_ukernel_up8x9__neon_mul16;
       xnn_params.qs8.dwconv[0].channel_tile = 8;
@@ -458,7 +469,7 @@
           .channel_tile = 4,
         };
         xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
-          .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__scalar_p4,
+          .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__neon_p8,
           .channel_tile = 1,
           .pixel_tile = 4,
         };
@@ -798,19 +809,20 @@
       #if XNN_ENABLE_ASSEMBLY
         if (cpuinfo_has_arm_neon_dot()) {
           xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64);
-          xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld64);
-          xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot);
+          xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot);
+          xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64);
           xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot);
           xnn_params.qs8.gemm.mr = 4;
           xnn_params.qs8.gemm.nr = 16;
           xnn_params.qs8.gemm.log2_kr = 2;
         } else {
-          xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_2x8__neon_mlal_lane);
-          xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_2x8__neon_mlal_lane);
-          xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x8__neon_mlal_lane);
-          xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x8__neon_mlal_lane);
+          xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
+          xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
+          xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal);
+          xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mlal_padal);
           xnn_params.qs8.gemm.mr = 2;
           xnn_params.qs8.gemm.nr = 8;
+          xnn_params.qs8.gemm.log2_kr = 3;
         }
       #else  // !XNN_ENABLE_ASSEMBLY
         if (cpuinfo_has_arm_neon_dot()) {
@@ -822,12 +834,13 @@
           xnn_params.qs8.gemm.nr = 16;
           xnn_params.qs8.gemm.log2_kr = 2;
         } else {
-          xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_2x8__neon_mlal_lane);
-          xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_2x8__neon_mlal_lane);
-          xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x8__neon_mlal_lane);
-          xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x8__neon_mlal_lane);
+          xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+          xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+          xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+          xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
           xnn_params.qs8.gemm.mr = 2;
           xnn_params.qs8.gemm.nr = 8;
+          xnn_params.qs8.gemm.log2_kr = 1;
         }
       #endif  // XNN_ENABLE_ASSEMBLY
     #else  // !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
@@ -836,24 +849,26 @@
           switch (cpuinfo_get_core(0)->uarch) {
             case cpuinfo_uarch_cortex_a55:
               xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55);
+              xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55);
               break;
             default:
               xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64);
+              xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64);
               break;
           }
-          xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot);
-          xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld64);
+          xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot);
           xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot);
           xnn_params.qs8.gemm.mr = 4;
           xnn_params.qs8.gemm.nr = 16;
           xnn_params.qs8.gemm.log2_kr = 2;
         } else {
-          xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_2x8__neon_mlal_lane);
-          xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_2x8__neon_mlal_lane);
-          xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x8__neon_mlal_lane);
-          xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x8__neon_mlal_lane);
+          xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
+          xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
+          xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal);
+          xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mlal_padal);
           xnn_params.qs8.gemm.mr = 2;
           xnn_params.qs8.gemm.nr = 8;
+          xnn_params.qs8.gemm.log2_kr = 3;
         }
         #if XNN_MAX_UARCH_TYPES > 1
         {
@@ -872,8 +887,8 @@
               case cpuinfo_uarch_cortex_a55:
                 if (mr == 4 && nr == 16 && log2_kr == 2) {
                   xnn_params.qs8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55;
-                  xnn_params.qs8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot;
-                  xnn_params.qs8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld64;
+                  xnn_params.qs8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55;
+                  xnn_params.qs8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot;
                   xnn_params.qs8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot;
                 }
                 break;
@@ -893,12 +908,13 @@
           xnn_params.qs8.gemm.nr = 16;
           xnn_params.qs8.gemm.log2_kr = 2;
         } else {
-          xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_2x8__neon_mlal_lane);
-          xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_2x8__neon_mlal_lane);
-          xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x8__neon_mlal_lane);
-          xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x8__neon_mlal_lane);
+          xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+          xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+          xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+          xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
           xnn_params.qs8.gemm.mr = 2;
           xnn_params.qs8.gemm.nr = 8;
+          xnn_params.qs8.gemm.log2_kr = 1;
         }
       #endif  // XNN_ENABLE_ASSEMBLY
     #endif  // XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
@@ -1335,18 +1351,18 @@
       init_flags |= XNN_INIT_FLAG_CHW_OPT;
 
       xnn_params.f32.spmm = (struct spmm_parameters) {
-        .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_16x1__neonfma_pipelined,
-        .mr = 16,
+        .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x1__neonfma_pipelined,
+        .mr = 32,
         .nr = 1,
       };
       xnn_params.f32.spmm2 = (struct spmm_parameters) {
-        .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_16x2__neonfma,
-        .mr = 16,
+        .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x2__neonfma,
+        .mr = 32,
         .nr = 2,
       };
       xnn_params.f32.spmm4 = (struct spmm_parameters) {
-        .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_16x4__neonfma,
-        .mr = 16,
+        .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x4__neonfma,
+        .mr = 32,
         .nr = 4,
       };
       xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
@@ -1381,7 +1397,7 @@
         .channel_tile = 4,
       };
       xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
-        .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__scalar_p4,
+        .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__neonfma_p8,
         .channel_tile = 1,
         .pixel_tile = 4,
       };
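As a rough guide to the parameter blocks being switched above: mr and nr are the rows and columns of the microkernel output tile, and log2_kr is the base-2 log of the K-dimension packing, so a kernel named 4x8c4 pairs with mr=4, nr=8, log2_kr=2. An illustrative sketch (not library code):

    #include <stddef.h>

    struct gemm_tile { size_t mr, nr, kr; };

    // Decode the tile shape implied by the fields set above, e.g.
    // mr=4, nr=8, log2_kr=2 -> 4x8c4; mr=2, nr=8, log2_kr=3 -> 2x8c8.
    static struct gemm_tile gemm_tile_from_params(size_t mr, size_t nr, size_t log2_kr) {
      const struct gemm_tile tile = { mr, nr, (size_t) 1 << log2_kr };
      return tile;
    }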
diff --git a/src/math/exp-avx512f-rr2-lut16-p3-perm.c b/src/math/exp-avx512f-rr2-lut16-p3-perm.c
index 367e9db..05e74ab 100644
--- a/src/math/exp-avx512f-rr2-lut16-p3-perm.c
+++ b/src/math/exp-avx512f-rr2-lut16-p3-perm.c
@@ -19,11 +19,11 @@
   assert(n % (16 * sizeof(float)) == 0);
 
   const __m512 vmagic_bias = _mm512_set1_ps(0x1.800000p23f);
+  const __m512 vlog2e_x16  = _mm512_set1_ps(0x1.715476p4f);
   // The smallest x for which expf(x) is non-zero.
   const __m512 vzero_cutoff = _mm512_set1_ps(-0x1.9FE368p6f);
   // The largest x for which expf(x) is finite.
   const __m512 vinf_cutoff = _mm512_set1_ps(0x1.62E42Ep6f);
-  const __m512 vlog2e_x16  = _mm512_set1_ps(0x1.715476p4f);
   const __m512 vminus_ln2_o16_hi = _mm512_set1_ps(-0x1.62e43p-5f);
   const __m512 vminus_ln2_o16_lo = _mm512_set1_ps(0x1.05c61p-33f);
   const __m512 vplus_inf = _mm512_set1_ps(INFINITY);
@@ -52,6 +52,10 @@
     // inputs at the very end of the algorithm.
     __m512 vn = _mm512_fmadd_ps(vx, vlog2e_x16, vmagic_bias);
 
+    // Detect underflow and overflow of expf(x) for further special handling.
+    const __mmask16 vinvof = _mm512_cmp_ps_mask(vx, vinf_cutoff, _CMP_NGT_UQ);
+    const __mmask16 vinvuf = _mm512_cmp_ps_mask(vx, vzero_cutoff, _CMP_NLT_UQ);
+
     // Create two floating-point numbers, sn (scale, normal) and so (scale, overflow) such that sn * so == 2**n
     // for inputs which don't cause overflow, i.e. -103.97207 <= x <= 88.72283, and -150 <= n <= 128 accordingly.
     // We need to use two numbers rather than one because a normalized single-precision exponent must be in [-127, 126]
@@ -64,7 +68,7 @@
     ven = _mm512_min_epi32(ven, vmax_exponent);
     veo = _mm512_sub_epi32(veo, ven);
     const __m512 vsn = _mm512_castsi512_ps(_mm512_add_epi32(ven, vdefault_exponent));
-    const __m512 vso = _mm512_castsi512_ps(_mm512_add_epi32(veo, vdefault_exponent));
+    const __m512 vso = _mm512_castsi512_ps(_mm512_maskz_add_epi32(vinvuf, veo, vdefault_exponent));
 
     // Use the low 4 bits of n (as integer) for table lookup.
     const __m512 vl = _mm512_permutexvar_ps(_mm512_castps_si512(vn), vtable);
@@ -90,10 +94,10 @@
 
     // For inputs below zero cutoff, replace output with +0.0f.
     // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vf = _mm512_maskz_mul_ps(_mm512_cmp_ps_mask(vx, vzero_cutoff, _CMP_NLT_US), vf, vsn);
+    vf = _mm512_maskz_mul_ps(vinvuf, vf, vsn);
     // For inputs above inf cutoff, replace output with +inf.
     // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vf = _mm512_mask_mul_ps(vplus_inf, _mm512_cmp_ps_mask(vx, vinf_cutoff, _CMP_NGT_US), vso, vf);
+    vf = _mm512_mask_mul_ps(vplus_inf, vinvof, vso, vf);
     _mm512_storeu_ps(output, vf);
 
     input += 16;
diff --git a/src/math/exp-avx512f-rr2-lut32-p2-perm2.c b/src/math/exp-avx512f-rr2-lut32-p2-perm2.c
index b8545cc..eaf2a1b 100644
--- a/src/math/exp-avx512f-rr2-lut32-p2-perm2.c
+++ b/src/math/exp-avx512f-rr2-lut32-p2-perm2.c
@@ -19,11 +19,11 @@
   assert(n % (16 * sizeof(float)) == 0);
 
   const __m512 vmagic_bias = _mm512_set1_ps(0x1.800000p23f);
+  const __m512 vlog2e_x32  = _mm512_set1_ps(0x1.715476p5f);
   // The smallest x for which expf(x) is non-zero.
   const __m512 vzero_cutoff = _mm512_set1_ps(-0x1.9FE368p6f);
   // The largest x for which expf(x) is finite.
   const __m512 vinf_cutoff = _mm512_set1_ps(0x1.62E42Ep6f);
-  const __m512 vlog2e_x32  = _mm512_set1_ps(0x1.715476p5f);
   const __m512 vminus_ln2_o32_hi = _mm512_set1_ps(-0x1.62e43p-6f);
   const __m512 vminus_ln2_o32_lo = _mm512_set1_ps(0x1.05c61p-34f);
   const __m512 vplus_inf = _mm512_set1_ps(INFINITY);
@@ -57,6 +57,10 @@
     // inputs at the very end of the algorithm.
     __m512 vn = _mm512_fmadd_ps(vx, vlog2e_x32, vmagic_bias);
 
+    // Detect underflow and overflow of expf(x) for further special handling.
+    const __mmask16 vinvof = _mm512_cmp_ps_mask(vx, vinf_cutoff, _CMP_NGT_UQ);
+    const __mmask16 vinvuf = _mm512_cmp_ps_mask(vx, vzero_cutoff, _CMP_NLT_UQ);
+
     // Create two floating-point numbers, sn (scale, normal) and so (scale, overflow) such that sn * so == 2**n
     // for inputs which don't cause overflow, i.e. -103.97207 <= x <= 88.72283, and -150 <= n <= 128 accordingly.
     // We need to use two numbers rather than one because a normalized single-precision exponent must be in [-127, 126]
@@ -69,7 +73,7 @@
     ven = _mm512_min_epi32(ven, vmax_exponent);
     veo = _mm512_sub_epi32(veo, ven);
     const __m512 vsn = _mm512_castsi512_ps(_mm512_add_epi32(ven, vdefault_exponent));
-    const __m512 vso = _mm512_castsi512_ps(_mm512_add_epi32(veo, vdefault_exponent));
+    const __m512 vso = _mm512_castsi512_ps(_mm512_maskz_add_epi32(vinvuf, veo, vdefault_exponent));
 
     // Use the low 5 bits of n (as integer) for table lookup.
     const __m512 vl = _mm512_permutex2var_ps(vtable_lo, _mm512_castps_si512(vn), vtable_hi);
@@ -94,10 +98,10 @@
 
     // For inputs below zero cutoff, replace output with +0.0f.
     // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vf = _mm512_maskz_mul_ps(_mm512_cmp_ps_mask(vx, vzero_cutoff, _CMP_NLT_US), vf, vsn);
+    vf = _mm512_maskz_mul_ps(vinvuf, vf, vsn);
     // For inputs above inf cutoff, replace output with +inf.
     // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vf = _mm512_mask_mul_ps(vplus_inf, _mm512_cmp_ps_mask(vx, vinf_cutoff, _CMP_NGT_US), vso, vf);
+    vf = _mm512_mask_mul_ps(vplus_inf, vinvof, vso, vf);
     _mm512_storeu_ps(output, vf);
 
     input += 16;
diff --git a/src/math/exp-avx512f-rr2-p5-scalef.c b/src/math/exp-avx512f-rr2-p5-scalef.c
index e07b63c..ff275d1 100644
--- a/src/math/exp-avx512f-rr2-p5-scalef.c
+++ b/src/math/exp-avx512f-rr2-p5-scalef.c
@@ -18,6 +18,12 @@
   assert(n % (16 * sizeof(float)) == 0);
 
   const __m512 vlog2e = _mm512_set1_ps(0x1.715476p+0f);
+
+  // The smallest x for which expf(x) is non-zero.
+  const __m512 vzero_cutoff = _mm512_set1_ps(-0x1.9FE368p+6f);
+  // The largest x for which expf(x) is finite.
+  const __m512 vinf_cutoff = _mm512_set1_ps(0x1.62E42Ep+6f);
+
   const __m512 vminus_ln2_hi = _mm512_set1_ps(-0x1.62E43p-1f);
   const __m512 vminus_ln2_lo = _mm512_set1_ps(0x1.05C61p-29f);
 
@@ -33,10 +39,23 @@
 
     // Compute reduced argument n := round(x / log(2)).
     const __m512 vn = _mm512_roundscale_ps(_mm512_mul_ps(vx, vlog2e), 0);
+
+    // Detect underflow and overflow of expf(x) for further special handling.
+    // For large positive or negative inputs, the range reduction may produce degenerate reduced arguments:
+    // - Reduced argument t can fall outside of [-log(2)/2, log(2)/2] range, leading to polynomial approximation p
+    //   being negative, and exp(n) * p being either -0.0f (in underflow case) or -inf (in overflow case) instead of
+    //   +0.0f and +inf respectively.
+    // - Reduced argument n can overflow and become +inf or -inf, leading to NaN in reduced argument t.
+    const __mmask16 vinvof = _mm512_cmp_ps_mask(vx, vinf_cutoff, _CMP_NGT_UQ);
+    const __mmask16 vinvuf = _mm512_cmp_ps_mask(vx, vzero_cutoff, _CMP_NLT_UQ);
+
     // Compute reduced argument t := x - n * log(2).
     // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    // Use masking to explicitly zero the reduced argument t for large positive inputs, so that NaN is not
+    // propagated into further computations. With t zeroed, the polynomial approximation evaluates to 1.0f,
+    // which correctly overflows to +inf when scaled by n = +inf.
     __m512 vt = _mm512_fmadd_ps(vn, vminus_ln2_hi, vx);
-    vt = _mm512_fmadd_ps(vn, vminus_ln2_lo, vt);
+    vt = _mm512_maskz_fmadd_ps(vinvof, vn, vminus_ln2_lo, vt);
 
     // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
     __m512 vp = _mm512_fmadd_ps(vc5, vt, vc4);
@@ -46,7 +65,9 @@
     vp = _mm512_fmadd_ps(vp, vt, vc0);
 
     // Reconstruct the final value as f = exp2(n) * p.
-    const __m512 vf = _mm512_scalef_ps(vp, vn);
+    // Use masking to explicitly zero (set to +0.0f) the result for large negative inputs, because for some of these
+    // inputs the polynomial approximation p is negative and thus exp2(n) * p == -0.0f.
+    const __m512 vf = _mm512_maskz_scalef_ps(vinvuf, vp, vn);
     _mm512_storeu_ps(output, vf);
 
     input += 16;
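
A scalar analogue of the algorithm above may help in following the vector code: compute n = round(x * log2(e)), reduce t = x - n*log(2) in two Cody-Waite steps, evaluate a degree-5 polynomial, and reconstruct 2**n * p. This is an illustration only: the masked special cases become plain early returns, Taylor coefficients stand in for the kernel's tuned ones, and NaN is handled explicitly up front.

    #include <math.h>

    static float exp_rr2_p5_sketch(float x) {
      const float zero_cutoff = -0x1.9FE368p+6f;  // smallest x for which expf(x) is non-zero
      const float inf_cutoff  =  0x1.62E42Ep+6f;  // largest x for which expf(x) is finite
      if (isnan(x)) return x;                     // propagate NaN unchanged
      if (!(x > zero_cutoff)) return 0.0f;        // underflow (also catches -inf)
      if (x > inf_cutoff) return INFINITY;        // overflow

      const float n = rintf(x * 0x1.715476p+0f);  // n := round(x * log2(e))
      float t = fmaf(n, -0x1.62E43p-1f, x);       // t := x - n * log(2), high part
      t = fmaf(n, 0x1.05C61p-29f, t);             //      low-part correction

      // Degree-5 polynomial for exp(t) on [-log(2)/2, log(2)/2] (Taylor terms for clarity).
      const float p = 1.0f + t * (1.0f + t * (0.5f + t * (0x1.555556p-3f
          + t * (0x1.555556p-5f + t * 0x1.111112p-7f))));
      return scalbnf(p, (int) n);                 // f := 2**n * p
    }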
diff --git a/src/operator-run.c b/src/operator-run.c
index 1dc06fb..bf6b2bc 100644
--- a/src/operator-run.c
+++ b/src/operator-run.c
@@ -453,10 +453,7 @@
     context->block_size,
     (const void*) ((uintptr_t) context->input + batch_index * context->input_batch_stride),
     (void*) ((uintptr_t) context->output + batch_index * context->output_batch_stride),
-    context->input_channel_stride,
-    context->input_height_stride,
-    context->output_height_stride,
-    context->output_width_stride);
+    context->output_channel_stride);
 }
 
 void xnn_compute_argmax_pooling_unipass(
diff --git a/src/operators/convolution-nhwc.c b/src/operators/convolution-nhwc.c
index de7aab0..3ae0077 100644
--- a/src/operators/convolution-nhwc.c
+++ b/src/operators/convolution-nhwc.c
@@ -1136,10 +1136,6 @@
       convolution_op->compute.range[1] = output_height;
       convolution_op->state = xnn_run_state_ready;
 
-      convolution_op->last_input = input;
-      convolution_op->last_input_height = input_height;
-      convolution_op->last_input_width = input_width;
-
       return xnn_status_success;
     }
     case xnn_ukernel_type_vmulcaddc:
diff --git a/src/operators/depth-to-space-nchw2nhwc.c b/src/operators/depth-to-space-nchw2nhwc.c
index 5a51454..518f28d 100644
--- a/src/operators/depth-to-space-nchw2nhwc.c
+++ b/src/operators/depth-to-space-nchw2nhwc.c
@@ -138,10 +138,7 @@
     .output = output,
     .input_batch_stride = depth_to_space_op->input_pixel_stride * input_height * input_width * sizeof(float),
     .output_batch_stride = depth_to_space_op->output_pixel_stride * output_height * output_width * sizeof(float),
-    .input_channel_stride = input_height * input_width * sizeof(float),
-    .input_height_stride = input_width * sizeof(float),
-    .output_height_stride = depth_to_space_op->output_pixel_stride * output_width * sizeof(float),
-    .output_width_stride = depth_to_space_op->output_pixel_stride * sizeof(float),
+    .output_channel_stride = depth_to_space_op->output_pixel_stride,
     .ukernel = xnn_params.x32.depthtospace2d_chw2hwc.ukernel,
   };
 
diff --git a/src/qs8-gemm/1x16c4-aarch64-neondot-ld32.S b/src/qs8-gemm/1x16c4-aarch64-neondot-ld32.S
new file mode 100644
index 0000000..28a3c39
--- /dev/null
+++ b/src/qs8-gemm/1x16c4-aarch64-neondot-ld32.S
@@ -0,0 +1,118 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <xnnpack/assembly.h>
+
+# void xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld32(
+#     size_t mr,                 x0
+#     size_t nc,                 x1
+#     size_t kc,                 x2 / x0
+#     const int8_t* restrict a,  x3
+#     size_t a_stride,           (x4)
+#     const void* restrict w,    x5
+#     int8_t* restrict c,        x6
+#     size_t cm_stride,          (x7)
+#     size_t cn_stride,          [sp] -> x12
+#     const union xnn_qs8_gemm_params params)  [sp + 8] -> x11
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0  x3 v0
+# B   x5 v16 v17 v18 v19
+# C0  x6 v28 v29 v30 v31
+# unused v4 v5 v6 v7 v8 v9 v10 v11 v12 v13 v14 v15
+
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld32
+0:
+        # Load initial bias from w into accumulators
+        ADD     x2, x2, 3           // kc = (kc + 3) & ~3
+        LDP     q28, q29, [x5], 32
+        BIC     x2, x2, 3
+        LDP     q30, q31, [x5], 32
+        MOV     x0, x2              // k = kc.  assumes kc > 0
+        LDR     x11, [sp, 8]        // params
+
+        # Main loop - 4 bytes of A
+        .p2align 3
+1:
+        LDR     s0,  [x3], 4
+        LDR     q16, [x5], 16
+        LDR     q17, [x5], 16
+        LDR     q18, [x5], 16
+        LDR     q19, [x5], 16
+        SDOT    v28.4s, v16.16b, v0.4b[0]
+        SDOT    v29.4s, v17.16b, v0.4b[0]
+        SUBS    x0, x0, 4
+        SDOT    v30.4s, v18.16b, v0.4b[0]
+        SDOT    v31.4s, v19.16b, v0.4b[0]
+        B.HI    1b
+
+        # Apply params - scale, shift, bias and clamp
+        LD2R    {v0.4s, v1.4s}, [x11], 8
+        CMEQ    v2.4s, v1.4s, 0
+        SQRDMULH  v4.4s, v28.4s, v0.4s
+        SQRDMULH  v5.4s, v29.4s, v0.4s
+        SQRDMULH  v6.4s, v30.4s, v0.4s
+        SQRDMULH  v7.4s, v31.4s, v0.4s
+        BIC     v28.16b, v28.16b, v2.16b
+        BIC     v29.16b, v29.16b, v2.16b
+        BIC     v30.16b, v30.16b, v2.16b
+        BIC     v31.16b, v31.16b, v2.16b
+        SSRA    v4.4s, v28.4s, 31  // signed shift right accumulate
+        SSRA    v5.4s, v29.4s, 31
+        SSRA    v6.4s, v30.4s, 31
+        SSRA    v7.4s, v31.4s, 31
+        SRSHL   v4.4s, v4.4s, v1.4s  // signed rounding shift left
+        SRSHL   v5.4s, v5.4s, v1.4s
+        SRSHL   v6.4s, v6.4s, v1.4s
+        SRSHL   v7.4s, v7.4s, v1.4s
+        LD1R    {v2.8h}, [x11], 2   // add bias
+        SQXTN   v4.4h, v4.4s
+        SQXTN   v6.4h, v6.4s
+        SQXTN2  v4.8h, v5.4s
+        SQXTN2  v6.8h, v7.4s
+        LD2R    {v0.16b, v1.16b}, [x11]   // clamp to min/max
+        SQADD   v4.8h, v4.8h, v2.8h
+        SQADD   v6.8h, v6.8h, v2.8h
+        LDR     x12, [sp]   // cn_stride
+        SQXTN   v4.8b, v4.8h
+        SQXTN2  v4.16b, v6.8h
+        SUBS    x1, x1, 16
+        SMAX    v4.16b, v4.16b, v0.16b
+        SMIN    v4.16b, v4.16b, v1.16b
+        B.LO    2f
+
+        # Store full 1 x 16
+        ST1     {v4.16b}, [x6], x12
+        SUB      x3,  x3, x2         // a0 -= kc
+        B.NE    0b
+        RET
+
+        # Store odd width
+        .p2align 3
+2:
+        TBZ     x1, 3, 3f
+        STR     d4, [x6], 8
+        DUP     d4, v4.d[1]
+3:
+        TBZ     x1, 2, 4f
+        STR     s4, [x6], 4
+        DUP     s4, v4.s[1]
+4:
+        TBZ     x1, 1, 5f
+        ST1     {v4.h}[0], [x6], 2
+        DUP     h4, v4.h[1]
+5:
+        TBZ     x1, 0, 6f
+        ST1     {v4.b}[0], [x6]
+6:
+        RET
+
+END_FUNCTION xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld32
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
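
The "Apply params - scale, shift, bias and clamp" block above is the usual QS8 requantization of each int32 accumulator. A rough scalar sketch of the same pipeline (parameter names are illustrative; the kernel's CMEQ/BIC/SSRA/SRSHL sequence handles rounding corner cases this sketch glosses over):

    #include <stdint.h>

    static int8_t requantize_sketch(int32_t acc, int32_t multiplier, uint32_t shift,
                                    int16_t output_zero_point, int8_t qmin, int8_t qmax) {
      // High 32 bits of the doubled 64-bit product, rounded (cf. SQRDMULH).
      const int64_t product = (int64_t) acc * (int64_t) multiplier;
      int32_t scaled = (int32_t) ((product + (INT64_C(1) << 30)) >> 31);
      // Rounding arithmetic right shift (cf. the rounding shift done via SRSHL).
      if (shift != 0) {
        scaled = (int32_t) ((scaled + (INT32_C(1) << (shift - 1))) >> shift);
      }
      int32_t out = scaled + output_zero_point;   // add output zero point (bias)
      if (out < qmin) out = qmin;                 // clamp to the quantized range
      if (out > qmax) out = qmax;
      return (int8_t) out;
    }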
diff --git a/src/qs8-gemm/1x16c4-aarch64-neondot-ld64.S b/src/qs8-gemm/1x16c4-aarch64-neondot-ld64.S
index 64b573a..adf736b 100644
--- a/src/qs8-gemm/1x16c4-aarch64-neondot-ld64.S
+++ b/src/qs8-gemm/1x16c4-aarch64-neondot-ld64.S
@@ -26,24 +26,25 @@
 # unused v8 v9 v10 v11 v12 v13 v14 v15
 
 BEGIN_FUNCTION xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld64
+        ADD      x2, x2, 3          // kc = (kc + 3) & ~3
+        BIC      x2, x2, 3
+
+        .p2align 3
 0:
         # Load initial bias from w into accumulators
         LDP     q28, q29, [x5], 32
+        SUBS     x0, x2, 8          // k = kc - 8
         LDP     q30, q31, [x5], 32
-        AND     x8, x2, 7  // remainder is 1 to 7 bytes.
-        LDR     x11, [sp, 8]   // params
+        LDR     x11, [sp, 8]        // params
 
         # Is there at least 8 bytes?
-        SUBS    x0, x2, 8  // k = kc - 8
         B.LO    3f
 
-
         # Main loop - 8 bytes of A
         .p2align 3
 1:
         LDR     d0,  [x3], 8
         LDR     q16, [x5, 0]
-        SUBS    x0, x0, 8
         LDR     q17, [x5, 16]
         SDOT    v28.4s, v16.16b, v0.4b[0]
         LDR     q18, [x5, 32]
@@ -60,20 +61,19 @@
         SDOT    v30.4s, v6.16b,  v0.4b[1]
         ADD     x5, x5, 128
         SDOT    v31.4s, v7.16b,  v0.4b[1]
+        SUBS    x0, x0, 8
         B.HS    1b
 
-        # Is there a remainder?- 1 to 7 bytes of A
-        CBNZ    x8, 3f
+        # Is there a remainder? - 1 to 4 bytes of A
+        TBNZ    x0, 2, 3f
 
-        .p2align 3
 2:
          # Apply params - scale, shift, bias and clamp
-        LD1R    {v0.4s}, [x11], 4
+        LD2R    {v0.4s, v1.4s}, [x11], 8
         SQRDMULH  v4.4s, v28.4s, v0.4s
-        LD1R    {v1.4s}, [x11], 4
         SQRDMULH  v5.4s, v29.4s, v0.4s
-        SQRDMULH  v6.4s, v30.4s, v0.4s
         CMEQ    v2.4s, v1.4s, 0
+        SQRDMULH  v6.4s, v30.4s, v0.4s
         SQRDMULH  v7.4s, v31.4s, v0.4s
         BIC     v28.16b, v28.16b, v2.16b
         BIC     v29.16b, v29.16b, v2.16b
@@ -105,17 +105,16 @@
 
         # Store full 1 x 16
         ST1     {v4.16b}, [x6], x12
-        SUB      x3,  x3, x2         // a0 -= kc
+        SUB     x3,  x3, x2          // a0 -= kc
         B.NE    0b
 
         RET
 
-        # Remainder- 1 to 7 bytes of A
+        # Remainder - 4 bytes of A
         .p2align 3
 3:
-        LD1     {v0.8b},  [x3], x8
+        LDR     s0,  [x3], 4
         LDR     q16, [x5, 0]
-        CMP     x8, 4
         LDR     q17, [x5, 16]
         SDOT    v28.4s, v16.16b, v0.4b[0]
         LDR     q18, [x5, 32]
@@ -124,17 +123,6 @@
         SDOT    v30.4s, v18.16b, v0.4b[0]
         ADD     x5, x5, 64
         SDOT    v31.4s, v19.16b, v0.4b[0]
-        B.LS    2b
-
-        LDR      q4, [x5, 0]
-        LDR      q5, [x5, 16]
-        SDOT    v28.4s, v4.16b,  v0.4b[1]
-        LDR      q6, [x5, 32]
-        SDOT    v29.4s, v5.16b,  v0.4b[1]
-        LDR      q7, [x5, 48]
-        SDOT    v30.4s, v6.16b,  v0.4b[1]
-        ADD     x5, x5, 64
-        SDOT    v31.4s, v7.16b,  v0.4b[1]
         B       2b
 
         # Store odd width
@@ -161,4 +149,4 @@
 
 #ifdef __ELF__
 .section ".note.GNU-stack","",%progbits
-#endif
\ No newline at end of file
+#endif
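
Several of these kernels begin by rounding kc up to the kernel's K packing (the ADD/BIC pair computing kc = (kc + 3) & ~3 above). In plain C this is the usual power-of-two align-up (sketch, not library code):

    #include <stddef.h>

    // Round kc up to a multiple of the packing kr (kr must be a power of two),
    // e.g. (kc + 3) & ~3 for c4 kernels, (kc + 7) & ~7 for c8, (kc + 15) & ~15 for c16.
    static inline size_t round_up_po2(size_t kc, size_t kr) {
      return (kc + kr - 1) & ~(kr - 1);
    }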
diff --git a/src/qs8-gemm/2x8c16-aarch64-neon-mlal-padal.S b/src/qs8-gemm/2x8c16-aarch64-neon-mlal-padal.S
new file mode 100644
index 0000000..1f9bff7
--- /dev/null
+++ b/src/qs8-gemm/2x8c16-aarch64-neon-mlal-padal.S
@@ -0,0 +1,218 @@
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <xnnpack/assembly.h>
+
+# void xnn_qs8_gemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal(
+#     size_t mr,                 x0
+#     size_t nc,                 x1
+#     size_t kc,                 x2 / x0
+#     const int8_t* restrict a,  x3
+#     size_t a_stride,           x4
+#     const void* restrict w,    x5
+#     int8_t* restrict c,        x6
+#     size_t cm_stride,          x7
+#     size_t cn_stride,          [sp] -> x10
+#     const union xnn_qs8_gemm_params params)  [sp + 8] -> x9
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0  x3  v0
+# A1  x4  v1
+# B   x5  v4  v5  v6  v7
+# C0  x6 v16 v18 v20 v22 v24 v26 v28 v30
+# C1  x7 v17 v19 v21 v23 v25 v27 v29 v31
+# temp0   v2 v10 v12 v14
+# temp1   v3 v11 v13 v15
+# unused  v8 v9
+
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal
+
+        # Clamp A and C pointers
+        CMP     x0, 2             // if mr < 2
+        STP     d10, d11, [sp, -48]!
+        ADD     x4, x3, x4        // a1 = a0 + a_stride
+        STP     d12, d13, [sp, 16]
+        ADD     x7, x6, x7        // c1 = c0 + cm_stride
+        STP     d14, d15, [sp, 32]
+        CSEL    x4, x3, x4, LO    //   a1 = a0
+        ADD     x2, x2, 15        // kc = (kc + 15) & ~15
+        CSEL    x7, x6, x7, LO    //   c1 = c0
+        BIC     x2, x2, 15
+
+        .p2align 3
+0:
+        # Load initial bias from w into accumulators
+        MOV     x0, x2   // k = kc
+        LDP     s16, s18, [x5], 8
+        MOV     v17.4s, v16.4s
+        MOV     v19.4s, v18.4s
+        LDP     s20, s22, [x5], 8
+        MOV     v21.4s, v20.4s
+        MOV     v23.4s, v22.4s
+        LDP     s24, s26, [x5], 8
+        MOV     v25.4s, v24.4s
+        MOV     v27.4s, v26.4s
+        LDP     s28, s30, [x5], 8
+        MOV     v29.4s, v28.4s
+        LDP     x10, x9, [sp, 48]  // cn_stride, params
+        MOV     v31.4s, v30.4s
+
+        # Main loop - 16 bytes of A
+        .p2align 3
+1:
+        LDR     q0, [x3], 16
+        LDP     q4, q5, [x5]
+        LDR     q1, [x4], 16
+        LDP     q6, q7, [x5, 32]
+        SMULL    v2.8h, v4.8b, v0.8b
+        SMULL    v3.8h, v4.8b, v1.8b
+        SMULL   v10.8h, v5.8b, v0.8b
+        SMULL   v11.8h, v5.8b, v1.8b
+        SMLAL2   v2.8h, v4.16b, v0.16b
+        SMLAL2   v3.8h, v4.16b, v1.16b
+        SMLAL2  v10.8h, v5.16b, v0.16b
+        SMLAL2  v11.8h, v5.16b, v1.16b
+        SMULL   v12.8h, v6.8b, v0.8b
+        SADALP  v16.4s,  v2.8h
+        SMULL   v13.8h, v6.8b, v1.8b
+        SADALP  v17.4s,  v3.8h
+        SMULL   v14.8h, v7.8b, v0.8b
+        SADALP  v18.4s, v10.8h
+        SMULL   v15.8h, v7.8b, v1.8b
+        SADALP  v19.4s, v11.8h
+        LDP     q4, q5, [x5, 64]
+        SMLAL2  v12.8h, v6.16b, v0.16b
+        SMLAL2  v13.8h, v6.16b, v1.16b
+        SMLAL2  v14.8h, v7.16b, v0.16b
+        SMLAL2  v15.8h, v7.16b, v1.16b
+        SMULL    v2.8h, v4.8b, v0.8b
+        SADALP  v20.4s, v12.8h
+        SMULL    v3.8h, v4.8b, v1.8b
+        SADALP  v21.4s, v13.8h
+        SMULL   v10.8h, v5.8b, v0.8b
+        SADALP  v22.4s, v14.8h
+        SMULL   v11.8h, v5.8b, v1.8b
+        SADALP  v23.4s, v15.8h
+        LDP     q6, q7, [x5, 96]
+
+        SMLAL2   v2.8h, v4.16b, v0.16b
+        SMLAL2   v3.8h, v4.16b, v1.16b
+        SMLAL2  v10.8h, v5.16b, v0.16b
+        SMLAL2  v11.8h, v5.16b, v1.16b
+        ADD     x5, x5, 128
+        SMULL   v12.8h, v6.8b, v0.8b
+        SADALP  v24.4s,  v2.8h
+        SMULL   v13.8h, v6.8b, v1.8b
+        SADALP  v25.4s,  v3.8h
+        SMULL   v14.8h, v7.8b, v0.8b
+        SADALP  v26.4s, v10.8h
+        SMULL   v15.8h, v7.8b, v1.8b
+        SADALP  v27.4s, v11.8h
+        SUBS    x0, x0, 16
+        SMLAL2  v12.8h, v6.16b, v0.16b
+        SMLAL2  v13.8h, v6.16b, v1.16b
+        SMLAL2  v14.8h, v7.16b, v0.16b
+        SMLAL2  v15.8h, v7.16b, v1.16b
+        SADALP  v28.4s, v12.8h
+        SADALP  v29.4s, v13.8h
+        SADALP  v30.4s, v14.8h
+        SADALP  v31.4s, v15.8h
+        B.HI    1b
+
+        # Add columns
+        ADDP    v16.4s, v16.4s, v18.4s
+        ADDP    v20.4s, v20.4s, v22.4s
+        LD1R    {v4.4s}, [x9], 4
+        ADDP    v24.4s, v24.4s, v26.4s
+        ADDP    v28.4s, v28.4s, v30.4s
+        LD1R    {v7.4s}, [x9], 4
+        ADDP    v17.4s, v17.4s, v19.4s
+        ADDP    v21.4s, v21.4s, v23.4s
+        ADDP    v25.4s, v25.4s, v27.4s
+        ADDP    v29.4s, v29.4s, v31.4s
+        ADDP    v0.4s, v16.4s, v20.4s
+        ADDP    v1.4s, v24.4s, v28.4s
+        ADDP    v2.4s, v17.4s, v21.4s
+        ADDP    v3.4s, v25.4s, v29.4s
+
+        # Apply params - scale, shift, bias and clamp
+        SQRDMULH        v0.4s, v0.4s, v4.4s
+        SQRDMULH        v1.4s, v1.4s, v4.4s
+        SQRDMULH        v2.4s, v2.4s, v4.4s
+        SQRDMULH        v3.4s, v3.4s, v4.4s
+        CMEQ    v4.4s, v7.4s, 0
+        LD1R    {v5.8h}, [x9], 2
+        BIC      v6.16b, v0.16b, v4.16b
+        BIC     v16.16b, v1.16b, v4.16b
+        BIC     v17.16b, v2.16b, v4.16b
+        BIC     v4.16b,  v3.16b, v4.16b
+        SSRA    v0.4s,  v6.4s, 31
+        SSRA    v1.4s, v16.4s, 31
+        SSRA    v2.4s, v17.4s, 31
+        SSRA    v3.4s,  v4.4s, 31
+        SRSHL   v0.4s, v0.4s, v7.4s
+        SRSHL   v1.4s, v1.4s, v7.4s
+        SRSHL   v2.4s, v2.4s, v7.4s
+        SRSHL   v3.4s, v3.4s, v7.4s
+        SQXTN   v0.4h, v0.4s
+        SQXTN   v2.4h, v2.4s
+        SQXTN2  v0.8h, v1.4s
+        SQXTN2  v2.8h, v3.4s
+        SUBS    x1, x1, 8
+        SQADD   v0.8h, v0.8h, v5.8h
+        SQADD   v1.8h, v2.8h, v5.8h
+        SQXTN   v0.8b, v0.8h
+        SQXTN2  v0.16b, v1.8h
+        LD1R    {v1.16b}, [x9], 1
+        LD1R    {v2.16b}, [x9]
+        SMAX    v0.16b, v0.16b, v1.16b
+        SMIN    v0.16b, v0.16b, v2.16b
+        B.LO    4f
+
+        # Store full 2 x 8
+        ST1     {v0.8b}, [x6], x10
+        SUB     x3, x3, x2     // a0 -= kc
+        ST1     {v0.d}[1], [x7], x10
+        SUB     x4, x4, x2     // a1 -= kc
+        B.HI    0b
+
+        # Restore d10-d15 from stack
+        LDP     d14, d15, [sp, 32]
+        LDP     d12, d13, [sp, 16]
+        LDP     d10, d11, [sp], 48
+        RET
+
+        # Store odd width
+        .p2align 3
+4:
+        TBZ     x1, 2, 5f
+        STR     s0, [x6], 4
+        ST1     {v0.s}[2], [x7], 4
+        EXT     v0.16b, v0.16b, v0.16b, 4
+
+5:
+        TBZ     x1, 1, 6f
+        ST1     {v0.h}[0], [x6], 2
+        ST1     {v0.h}[4], [x7], 2
+        EXT     v0.16b, v0.16b, v0.16b, 2
+6:
+        TBZ     x1, 0, 7f
+        ST1     {v0.b}[0], [x6]
+        ST1     {v0.b}[8], [x7]
+7:
+        # Restore d10-d15 from stack
+        LDP     d14, d15, [sp, 32]
+        LDP     d12, d13, [sp, 16]
+        LDP     d10, d11, [sp], 48
+        RET
+
+END_FUNCTION xnn_qs8_gemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
+
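The SMULL/SMLAL2/SADALP pattern that dominates the loop above widens int8 products into int16 and then pairwise-accumulates into int32 lanes; two int8 x int8 products per int16 lane cannot overflow (2 * 127 * 128 < 32768), which is why each SADALP is fed by exactly one SMULL/SMLAL2 pair. A minimal intrinsics sketch of one such step (illustrative, not kernel code):

    #include <arm_neon.h>

    // acc += pairwise sums of the 16 products a[i] * b[i], widened to int32.
    static int32x4_t mlal_padal_step(int32x4_t acc, int8x16_t a, int8x16_t b) {
      int16x8_t prod = vmull_s8(vget_low_s8(a), vget_low_s8(b));  // SMULL: low 8 lanes
      prod = vmlal_high_s8(prod, a, b);                           // SMLAL2: high 8 lanes
      return vpadalq_s16(acc, prod);                              // SADALP: pairwise add-accumulate
    }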
diff --git a/src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal.S b/src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal.S
new file mode 100644
index 0000000..4010678
--- /dev/null
+++ b/src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal.S
@@ -0,0 +1,354 @@
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <xnnpack/assembly.h>
+
+# void xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal(
+#     size_t mr,                 x0
+#     size_t nc,                 x1
+#     size_t kc,                 x2 / x0
+#     const int8_t* restrict a,  x3
+#     size_t a_stride,           x4
+#     const void* restrict w,    x5
+#     int8_t* restrict c,        x6
+#     size_t cm_stride,          x7
+#     size_t cn_stride,          [sp] -> x10
+#     const union xnn_qs8_gemm_params params)  [sp + 8] -> x9
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0  x3  v0  v6
+# A1  x4  v1  v7
+# B   x5  v4  v5  v8 v9
+# C0  x6 v16 v18 v20 v22 v24 v26 v28 v30
+# C1  x7 v17 v19 v21 v23 v25 v27 v29 v31
+# temp0   v2 v10 v12 v14
+# temp1   v3 v11 v13 v15
+
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal
+
+        # Clamp A and C pointers
+        CMP     x0, 2             // if mr < 2
+        STP     d8, d9, [sp, -64]!
+        ADD     x4, x3, x4        // a1 = a0 + a_stride
+        STP     d10, d11, [sp, 16]
+        ADD     x7, x6, x7        // c1 = c0 + cm_stride
+        STP     d12, d13, [sp, 32]
+        CSEL    x4, x3, x4, LO    //   a1 = a0
+        STP     d14, d15, [sp, 48]
+        ADD     x2, x2, 7         // kc = (kc + 7) & ~7
+        CSEL    x7, x6, x7, LO    //   c1 = c0
+        BIC     x2, x2, 7
+
+        .p2align 3
+0:
+        # Load initial bias from w into accumulators
+        SUBS    x0, x2, 16         // k = kc - 16
+        LDP     s16, s18, [x5], 8
+        MOV     v17.4s, v16.4s
+        MOV     v19.4s, v18.4s
+        LDP     s20, s22, [x5], 8
+        MOV     v21.4s, v20.4s
+        MOV     v23.4s, v22.4s
+        LDP     s24, s26, [x5], 8
+        MOV     v25.4s, v24.4s
+        MOV     v27.4s, v26.4s
+        LDP     s28, s30, [x5], 8
+        MOV     v29.4s, v28.4s
+        LDP     x10, x9, [sp, 64]  // cn_stride, params
+        MOV     v31.4s, v30.4s
+        # Is there at least 16 bytes for epilogue?
+        B.LO    4f
+
+        # Prologue
+        LDP     d0, d6, [x3], 16   // Read A0
+        LDP     d4, d5, [x5]
+        LDP     d1, d7, [x4], 16
+        LDP     d8, d9, [x5, 64]
+        SMULL    v2.8h, v4.8b, v0.8b
+        SMULL    v3.8h, v4.8b, v1.8b
+        SUBS    x0, x0, 16         // k -= 16
+        SMULL   v10.8h, v5.8b, v0.8b
+        SMULL   v11.8h, v5.8b, v1.8b
+        LDP     d4, d5, [x5, 16]
+
+        # Is there at least 16 bytes for main loop?
+        B.LO    2f
+
+        # Main loop - 16 bytes of A
+
+        .p2align 3
+1:
+        SMLAL    v2.8h, v8.8b, v6.8b
+        SMLAL    v3.8h, v8.8b, v7.8b
+        SMLAL   v10.8h, v9.8b, v6.8b
+        SMLAL   v11.8h, v9.8b, v7.8b
+
+        LDP     d8, d9, [x5, 80]
+        SMULL   v12.8h, v4.8b, v0.8b
+        SADALP  v16.4s,  v2.8h
+        SMULL   v13.8h, v4.8b, v1.8b
+        SADALP  v17.4s,  v3.8h
+        SMULL   v14.8h, v5.8b, v0.8b
+        SADALP  v18.4s, v10.8h
+        SMULL   v15.8h, v5.8b, v1.8b
+        SADALP  v19.4s, v11.8h
+        LDP     d4, d5, [x5, 32]
+        SMLAL   v12.8h, v8.8b, v6.8b
+        SMLAL   v13.8h, v8.8b, v7.8b
+        SMLAL   v14.8h, v9.8b, v6.8b
+        SMLAL   v15.8h, v9.8b, v7.8b
+
+        LDP     d8, d9, [x5, 96]
+        SMULL    v2.8h, v4.8b, v0.8b
+        SADALP  v20.4s, v12.8h
+        SMULL    v3.8h, v4.8b, v1.8b
+        SADALP  v21.4s, v13.8h
+        SMULL   v10.8h, v5.8b, v0.8b
+        SADALP  v22.4s, v14.8h
+        SMULL   v11.8h, v5.8b, v1.8b
+        SADALP  v23.4s, v15.8h
+        LDP     d4, d5, [x5, 48]
+        SMLAL    v2.8h, v8.8b, v6.8b
+        SMLAL    v3.8h, v8.8b, v7.8b
+        SMLAL   v10.8h, v9.8b, v6.8b
+        SMLAL   v11.8h, v9.8b, v7.8b
+
+        LDP     d8, d9, [x5, 112]
+        SMULL   v12.8h, v4.8b, v0.8b
+        SADALP  v24.4s,  v2.8h
+        SMULL   v13.8h, v4.8b, v1.8b
+        SADALP  v25.4s,  v3.8h
+        SMULL   v14.8h, v5.8b, v0.8b
+        SADALP  v26.4s, v10.8h
+        SMULL   v15.8h, v5.8b, v1.8b
+        SADALP  v27.4s, v11.8h
+        LDP     d4, d5, [x5, 128]  // Read B0
+        SMLAL   v12.8h, v8.8b, v6.8b
+        SMLAL   v13.8h, v8.8b, v7.8b
+        ADD     x5, x5, 128
+        SMLAL   v14.8h, v9.8b, v6.8b
+        LDP     d0, d6, [x3], 16   // Read A0
+        SMLAL   v15.8h, v9.8b, v7.8b
+
+        # Start of next loop iteration
+        LDP     d1, d7, [x4], 16
+        LDP     d8, d9, [x5, 64]
+        SMULL    v2.8h, v4.8b, v0.8b
+        SADALP  v28.4s, v12.8h
+        SMULL    v3.8h, v4.8b, v1.8b
+        SADALP  v29.4s, v13.8h
+        SMULL   v10.8h, v5.8b, v0.8b
+        SADALP  v30.4s, v14.8h
+        SMULL   v11.8h, v5.8b, v1.8b
+        SUBS    x0, x0, 16
+        SADALP  v31.4s, v15.8h
+        LDP     d4, d5, [x5, 16]
+        B.HS    1b
+
+        # Epilogue loop - 16 bytes of A
+        # Same as main loop except it does not read a0 or b0
+        .p2align 3
+2:
+        SMLAL    v2.8h, v8.8b, v6.8b
+        SMLAL    v3.8h, v8.8b, v7.8b
+        SMLAL   v10.8h, v9.8b, v6.8b
+        SMLAL   v11.8h, v9.8b, v7.8b
+
+        LDP     d8, d9, [x5, 80]
+        SMULL   v12.8h, v4.8b, v0.8b
+        SADALP  v16.4s,  v2.8h
+        SMULL   v13.8h, v4.8b, v1.8b
+        SADALP  v17.4s,  v3.8h
+        SMULL   v14.8h, v5.8b, v0.8b
+        SADALP  v18.4s, v10.8h
+        SMULL   v15.8h, v5.8b, v1.8b
+        SADALP  v19.4s, v11.8h
+        LDP     d4, d5, [x5, 32]
+        SMLAL   v12.8h, v8.8b, v6.8b
+        SMLAL   v13.8h, v8.8b, v7.8b
+        SMLAL   v14.8h, v9.8b, v6.8b
+        SMLAL   v15.8h, v9.8b, v7.8b
+
+        LDP     d8, d9, [x5, 96]
+        SMULL    v2.8h, v4.8b, v0.8b
+        SADALP  v20.4s, v12.8h
+        SMULL    v3.8h, v4.8b, v1.8b
+        SADALP  v21.4s, v13.8h
+        SMULL   v10.8h, v5.8b, v0.8b
+        SADALP  v22.4s, v14.8h
+        SMULL   v11.8h, v5.8b, v1.8b
+        SADALP  v23.4s, v15.8h
+        LDP     d4, d5, [x5, 48]
+        SMLAL    v2.8h, v8.8b, v6.8b
+        SMLAL    v3.8h, v8.8b, v7.8b
+        SMLAL   v10.8h, v9.8b, v6.8b
+        SMLAL   v11.8h, v9.8b, v7.8b
+
+        LDP     d8, d9, [x5, 112]
+        SMULL   v12.8h, v4.8b, v0.8b
+        SADALP  v24.4s,  v2.8h
+        SMULL   v13.8h, v4.8b, v1.8b
+        SADALP  v25.4s,  v3.8h
+        SMULL   v14.8h, v5.8b, v0.8b
+        SADALP  v26.4s, v10.8h
+        SMULL   v15.8h, v5.8b, v1.8b
+        SADALP  v27.4s, v11.8h
+        SMLAL   v12.8h, v8.8b, v6.8b
+        SMLAL   v13.8h, v8.8b, v7.8b
+        SMLAL   v14.8h, v9.8b, v6.8b
+        SMLAL   v15.8h, v9.8b, v7.8b
+        ADD     x5, x5, 128
+
+        SADALP  v28.4s, v12.8h
+        SADALP  v29.4s, v13.8h
+        SADALP  v30.4s, v14.8h
+        SADALP  v31.4s, v15.8h
+
+        # Is there a remainder? - 8 bytes of A
+        TBNZ    x0, 3, 4f
+
+        .p2align 3
+3:
+        # Add columns
+        ADDP    v16.4s, v16.4s, v18.4s
+        ADDP    v20.4s, v20.4s, v22.4s
+        LD1R    {v4.4s}, [x9], 4
+        ADDP    v24.4s, v24.4s, v26.4s
+        ADDP    v28.4s, v28.4s, v30.4s
+        LD1R    {v7.4s}, [x9], 4
+        ADDP    v17.4s, v17.4s, v19.4s
+        ADDP    v21.4s, v21.4s, v23.4s
+        ADDP    v25.4s, v25.4s, v27.4s
+        ADDP    v29.4s, v29.4s, v31.4s
+        ADDP    v0.4s, v16.4s, v20.4s
+        ADDP    v1.4s, v24.4s, v28.4s
+        ADDP    v2.4s, v17.4s, v21.4s
+        ADDP    v3.4s, v25.4s, v29.4s
+
+        # Apply params - scale, shift, bias and clamp
+        SQRDMULH        v0.4s, v0.4s, v4.4s
+        SQRDMULH        v1.4s, v1.4s, v4.4s
+        SQRDMULH        v2.4s, v2.4s, v4.4s
+        SQRDMULH        v3.4s, v3.4s, v4.4s
+        CMEQ    v4.4s, v7.4s, 0
+        LD1R    {v5.8h}, [x9], 2
+        BIC      v6.16b, v0.16b, v4.16b
+        BIC     v16.16b, v1.16b, v4.16b
+        BIC     v17.16b, v2.16b, v4.16b
+        BIC     v4.16b,  v3.16b, v4.16b
+        SSRA    v0.4s,  v6.4s, 31
+        SSRA    v1.4s, v16.4s, 31
+        SSRA    v2.4s, v17.4s, 31
+        SSRA    v3.4s,  v4.4s, 31
+        SRSHL   v0.4s, v0.4s, v7.4s
+        SRSHL   v1.4s, v1.4s, v7.4s
+        SRSHL   v2.4s, v2.4s, v7.4s
+        SRSHL   v3.4s, v3.4s, v7.4s
+        SQXTN   v0.4h, v0.4s
+        SQXTN   v2.4h, v2.4s
+        SQXTN2  v0.8h, v1.4s
+        SQXTN2  v2.8h, v3.4s
+        SUBS    x1, x1, 8
+        SQADD   v0.8h, v0.8h, v5.8h
+        SQADD   v1.8h, v2.8h, v5.8h
+        SQXTN   v0.8b, v0.8h
+        SQXTN2  v0.16b, v1.8h
+        LD1R    {v1.16b}, [x9], 1
+        LD1R    {v2.16b}, [x9]
+        SMAX    v0.16b, v0.16b, v1.16b
+        SMIN    v0.16b, v0.16b, v2.16b
+        B.LO    5f
+
+        # Store full 2 x 8
+        ST1     {v0.8b}, [x6], x10
+        SUB     x3, x3, x2     // a0 -= kc
+        ST1     {v0.d}[1], [x7], x10
+        SUB     x4, x4, x2     // a1 -= kc
+        B.HI    0b
+
+        # Restore d8-d15 from stack
+        LDP     d14, d15, [sp, 48]
+        LDP     d12, d13, [sp, 32]
+        LDP     d10, d11, [sp, 16]
+        LDP     d8, d9, [sp], 64
+        RET
+
+        # Remainder - 8 bytes of A
+        .p2align 3
+4:
+        LDR     d0, [x3], 8
+        LDP     d4, d5, [x5]
+        LDR     d1, [x4], 8
+        LDP     d6, d7, [x5, 16]
+        SMULL    v2.8h, v4.8b, v0.8b
+        SMULL    v3.8h, v4.8b, v1.8b
+        SMULL   v10.8h, v5.8b, v0.8b
+        SMULL   v11.8h, v5.8b, v1.8b
+        SMULL   v12.8h, v6.8b, v0.8b
+        SADALP  v16.4s,  v2.8h
+        SMULL   v13.8h, v6.8b, v1.8b
+        SADALP  v17.4s,  v3.8h
+        SMULL   v14.8h, v7.8b, v0.8b
+        SADALP  v18.4s, v10.8h
+        SMULL   v15.8h, v7.8b, v1.8b
+        SADALP  v19.4s, v11.8h
+        LDP     d4, d5, [x5, 32]
+        SMULL    v2.8h, v4.8b, v0.8b
+        SADALP  v20.4s, v12.8h
+        SMULL    v3.8h, v4.8b, v1.8b
+        SADALP  v21.4s, v13.8h
+        SMULL   v10.8h, v5.8b, v0.8b
+        SADALP  v22.4s, v14.8h
+        SMULL   v11.8h, v5.8b, v1.8b
+        SADALP  v23.4s, v15.8h
+        LDP     d6, d7, [x5, 48]
+        SMULL   v12.8h, v6.8b, v0.8b
+        SADALP  v24.4s,  v2.8h
+        SMULL   v13.8h, v6.8b, v1.8b
+        SADALP  v25.4s,  v3.8h
+        SMULL   v14.8h, v7.8b, v0.8b
+        SADALP  v26.4s, v10.8h
+        SMULL   v15.8h, v7.8b, v1.8b
+        SADALP  v27.4s, v11.8h
+        ADD     x5, x5, 64
+        SADALP  v28.4s, v12.8h
+        SADALP  v29.4s, v13.8h
+        SADALP  v30.4s, v14.8h
+        SADALP  v31.4s, v15.8h
+        B       3b
+
+        # Store odd width
+        .p2align 3
+5:
+        TBZ     x1, 2, 6f
+        STR     s0, [x6], 4
+        ST1     {v0.s}[2], [x7], 4
+        EXT     v0.16b, v0.16b, v0.16b, 4
+
+6:
+        TBZ     x1, 1, 7f
+        ST1     {v0.h}[0], [x6], 2
+        ST1     {v0.h}[4], [x7], 2
+        EXT     v0.16b, v0.16b, v0.16b, 2
+7:
+        TBZ     x1, 0, 8f
+        ST1     {v0.b}[0], [x6]
+        ST1     {v0.b}[8], [x7]
+8:
+        # Restore d8-d15 from stack
+        LDP     d14, d15, [sp, 48]
+        LDP     d12, d13, [sp, 32]
+        LDP     d10, d11, [sp, 16]
+        LDP     d8, d9, [sp], 64
+        RET
+
+END_FUNCTION xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
+
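Note on the "Apply params - scale, shift, bias and clamp" blocks in the two kernels above: SQRDMULH scales each 32-bit accumulator by the Q31 multiplier, the CMEQ/BIC/SSRA trio pre-adjusts negative values when the shift is nonzero so that the following SRSHL (a rounding shift by a negative amount, i.e. a rounding right shift) rounds consistently, then the output zero point is added and the result is clamped to the int8 range. The scalar sketch below is only an illustration of that math under stated assumptions: the function name is hypothetical, the shift parameter is taken as a non-negative magnitude (the kernel stores the negated shift), and the saturating-narrow steps (SQXTN/SQADD) are folded into a plain 32-bit add and clamp.

#include <stdint.h>

/* Hypothetical scalar sketch of the SQRDMULH + BIC/SSRA + SRSHL requantization
 * performed by the "Apply params" blocks above. Not the library's reference
 * implementation. */
static inline int8_t requantize_q31_sketch(int32_t acc, int32_t multiplier,
                                           uint32_t shift,  /* magnitude, >= 0 */
                                           int16_t zero_point,
                                           int8_t out_min, int8_t out_max) {
  /* SQRDMULH: rounding doubling multiply returning the high 32 bits. */
  const int64_t prod = (int64_t) acc * (int64_t) multiplier;
  int32_t q31 = (int32_t) ((prod + (INT64_C(1) << 30)) >> 31);
  /* BIC/SSRA: subtract 1 from negative values, but only when a shift follows. */
  if (shift != 0) {
    q31 += (q31 >> 31);
  }
  /* SRSHL by a negative amount: rounding arithmetic shift right. */
  const int32_t shifted = (shift == 0)
      ? q31
      : (int32_t) (((int64_t) q31 + (1 << (shift - 1))) >> shift);
  /* Add the output zero point and clamp to the output range. */
  int32_t out = shifted + (int32_t) zero_point;
  if (out < (int32_t) out_min) out = out_min;
  if (out > (int32_t) out_max) out = out_max;
  return (int8_t) out;
}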
diff --git a/src/qs8-gemm/2x8c8-aarch64-neon-mull-padal.S b/src/qs8-gemm/2x8c8-aarch64-neon-mull-padal.S
new file mode 100644
index 0000000..c472364
--- /dev/null
+++ b/src/qs8-gemm/2x8c8-aarch64-neon-mull-padal.S
@@ -0,0 +1,201 @@
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <xnnpack/assembly.h>
+
+# void xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mull_padal(
+#     size_t mr,                 x0
+#     size_t nc,                 x1
+#     size_t kc,                 x2 / x0
+#     const int8_t* restrict a,  x3
+#     size_t a_stride,           x4
+#     const void* restrict w,    x5
+#     int8_t* restrict c,        x6
+#     size_t cm_stride,          x7
+#     size_t cn_stride,          [sp] -> x10
+#     const union xnn_qs8_gemm_params params)  [sp + 8] -> x9
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0  x3  v0
+# A1  x4  v1
+# B   x5  v4  v5  v6  v7
+# C0  x6 v16 v18 v20 v22 v24 v26 v28 v30
+# C1  x7 v17 v19 v21 v23 v25 v27 v29 v31
+# temp0   v2 v10 v12 v14
+# temp1   v3 v11 v13 v15
+# unused  v8 v9
+
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mull_padal
+
+        # Clamp A and C pointers
+        CMP     x0, 2             // if mr < 2
+        STP     d10, d11, [sp, -48]!
+        ADD     x4, x3, x4        // a1 = a0 + a_stride
+        STP     d12, d13, [sp, 16]
+        ADD     x7, x6, x7        // c1 = c0 + cm_stride
+        STP     d14, d15, [sp, 32]
+        CSEL    x4, x3, x4, LO    //   a1 = a0
+        ADD     x2, x2, 7         // kc = (kc + 7) & ~7
+        CSEL    x7, x6, x7, LO    //   c1 = c0
+        BIC     x2, x2, 7
+
+        .p2align 3
+0:
+        # Load initial bias from w into accumulators
+        MOV     x0, x2   // k = kc
+        LDP     s16, s18, [x5], 8
+        MOV     v17.4s, v16.4s
+        MOV     v19.4s, v18.4s
+        LDP     s20, s22, [x5], 8
+        MOV     v21.4s, v20.4s
+        MOV     v23.4s, v22.4s
+        LDP     s24, s26, [x5], 8
+        MOV     v25.4s, v24.4s
+        MOV     v27.4s, v26.4s
+        LDP     s28, s30, [x5], 8
+        MOV     v29.4s, v28.4s
+        LDP     x10, x9, [sp, 48]  // cn_stride, params
+        MOV     v31.4s, v30.4s
+
+        # Main loop - 8 bytes of A
+        .p2align 3
+1:
+        LDR     d0, [x3], 8
+        LDP     d4, d5, [x5]
+        LDR     d1, [x4], 8
+        LDP     d6, d7, [x5, 16]
+        SMULL    v2.8h, v4.8b, v0.8b
+        SMULL    v3.8h, v4.8b, v1.8b
+        SMULL   v10.8h, v5.8b, v0.8b
+        SMULL   v11.8h, v5.8b, v1.8b
+        SMULL   v12.8h, v6.8b, v0.8b
+        SADALP  v16.4s,  v2.8h
+        SMULL   v13.8h, v6.8b, v1.8b
+        SADALP  v17.4s,  v3.8h
+        SMULL   v14.8h, v7.8b, v0.8b
+        SADALP  v18.4s, v10.8h
+        SMULL   v15.8h, v7.8b, v1.8b
+        SADALP  v19.4s, v11.8h
+        LDP     d4, d5, [x5, 32]
+        SMULL    v2.8h, v4.8b, v0.8b
+        SADALP  v20.4s, v12.8h
+        SMULL    v3.8h, v4.8b, v1.8b
+        SADALP  v21.4s, v13.8h
+        SMULL   v10.8h, v5.8b, v0.8b
+        SADALP  v22.4s, v14.8h
+        SMULL   v11.8h, v5.8b, v1.8b
+        SADALP  v23.4s, v15.8h
+        LDP     d6, d7, [x5, 48]
+        SMULL   v12.8h, v6.8b, v0.8b
+        SADALP  v24.4s,  v2.8h
+        SMULL   v13.8h, v6.8b, v1.8b
+        SADALP  v25.4s,  v3.8h
+        SMULL   v14.8h, v7.8b, v0.8b
+        SADALP  v26.4s, v10.8h
+        SMULL   v15.8h, v7.8b, v1.8b
+        SADALP  v27.4s, v11.8h
+        ADD     x5, x5, 64
+        SADALP  v28.4s, v12.8h
+        SADALP  v29.4s, v13.8h
+        SUBS    x0, x0, 8
+        SADALP  v30.4s, v14.8h
+        SADALP  v31.4s, v15.8h
+        B.HI    1b
+
+        # Add columns
+        ADDP    v16.4s, v16.4s, v18.4s
+        ADDP    v20.4s, v20.4s, v22.4s
+        LD1R    {v4.4s}, [x9], 4
+        ADDP    v24.4s, v24.4s, v26.4s
+        ADDP    v28.4s, v28.4s, v30.4s
+        LD1R    {v7.4s}, [x9], 4
+        ADDP    v17.4s, v17.4s, v19.4s
+        ADDP    v21.4s, v21.4s, v23.4s
+        ADDP    v25.4s, v25.4s, v27.4s
+        ADDP    v29.4s, v29.4s, v31.4s
+        ADDP    v0.4s, v16.4s, v20.4s
+        ADDP    v1.4s, v24.4s, v28.4s
+        ADDP    v2.4s, v17.4s, v21.4s
+        ADDP    v3.4s, v25.4s, v29.4s
+
+        # Apply params - scale, shift, bias and clamp
+        SQRDMULH        v0.4s, v0.4s, v4.4s
+        SQRDMULH        v1.4s, v1.4s, v4.4s
+        SQRDMULH        v2.4s, v2.4s, v4.4s
+        SQRDMULH        v3.4s, v3.4s, v4.4s
+        CMEQ    v4.4s, v7.4s, 0
+        LD1R    {v5.8h}, [x9], 2
+        BIC      v6.16b, v0.16b, v4.16b
+        BIC     v16.16b, v1.16b, v4.16b
+        BIC     v17.16b, v2.16b, v4.16b
+        BIC     v4.16b,  v3.16b, v4.16b
+        SSRA    v0.4s,  v6.4s, 31
+        SSRA    v1.4s, v16.4s, 31
+        SSRA    v2.4s, v17.4s, 31
+        SSRA    v3.4s,  v4.4s, 31
+        SRSHL   v0.4s, v0.4s, v7.4s
+        SRSHL   v1.4s, v1.4s, v7.4s
+        SRSHL   v2.4s, v2.4s, v7.4s
+        SRSHL   v3.4s, v3.4s, v7.4s
+        SQXTN   v0.4h, v0.4s
+        SQXTN   v2.4h, v2.4s
+        SQXTN2  v0.8h, v1.4s
+        SQXTN2  v2.8h, v3.4s
+        SUBS    x1, x1, 8
+        SQADD   v0.8h, v0.8h, v5.8h
+        SQADD   v1.8h, v2.8h, v5.8h
+        SQXTN   v0.8b, v0.8h
+        SQXTN2  v0.16b, v1.8h
+        LD1R    {v1.16b}, [x9], 1
+        LD1R    {v2.16b}, [x9]
+        SMAX    v0.16b, v0.16b, v1.16b
+        SMIN    v0.16b, v0.16b, v2.16b
+        B.LO    2f
+
+        # Store full 2 x 8
+        ST1     {v0.8b}, [x6], x10
+        SUB     x3, x3, x2     // a0 -= kc
+        ST1     {v0.d}[1], [x7], x10
+        SUB     x4, x4, x2     // a1 -= kc
+        B.HI    0b
+
+        # Restore d10-d15 from stack
+        LDP     d14, d15, [sp, 32]
+        LDP     d12, d13, [sp, 16]
+        LDP     d10, d11, [sp], 48
+        RET
+
+        # Store odd width
+        .p2align 3
+2:
+        TBZ     x1, 2, 3f
+        STR     s0, [x6], 4
+        ST1     {v0.s}[2], [x7], 4
+        EXT     v0.16b, v0.16b, v0.16b, 4
+
+3:
+        TBZ     x1, 1, 4f
+        ST1     {v0.h}[0], [x6], 2
+        ST1     {v0.h}[4], [x7], 2
+        EXT     v0.16b, v0.16b, v0.16b, 2
+4:
+        TBZ     x1, 0, 5f
+        ST1     {v0.b}[0], [x6]
+        ST1     {v0.b}[8], [x7]
+5:
+        # Restore d10-d15 from stack
+        LDP     d14, d15, [sp, 32]
+        LDP     d12, d13, [sp, 16]
+        LDP     d10, d11, [sp], 48
+        RET
+
+END_FUNCTION xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mull_padal
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
+
diff --git a/src/qs8-gemm/4x16c4-aarch64-neondot-cortex-a55.S b/src/qs8-gemm/4x16c4-aarch64-neondot-cortex-a55.S
index 673bd29..16aee9a 100644
--- a/src/qs8-gemm/4x16c4-aarch64-neondot-cortex-a55.S
+++ b/src/qs8-gemm/4x16c4-aarch64-neondot-cortex-a55.S
@@ -8,7 +8,7 @@
 # void xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
-#     size_t kc,                 x2 / x0 / x10
+#     size_t kc,                 x2 / x0
 #     const int8_t* restrict a,  x3
 #     size_t a_stride,           x4
 #     const void* restrict w,    x5
@@ -35,22 +35,24 @@
 
         # Clamp A and C pointers
         CMP      x0, 2             // if mr < 2
+        STP      d8,  d9, [sp, -32]!
         ADD     x15, x3, x4        // a1 = a0 + a_stride
         ADD      x8, x6, x7        // c1 = c0 + cm_stride
-        STP      d8,  d9, [sp, -32]!
+        STP     d10, d11, [sp, 16]
         CSEL    x15, x3, x15, LO   //   a1 = a0
         CSEL     x8, x6,  x8, LO   //   c1 = c0
+        ADD      x2, x2, 3         // kc = (kc + 3) & ~3
 
         ADD     x13, x15, x4       // a2 = a1 + a_stride
         ADD      x9,  x8, x7       // c2 = c1 + cm_stride
                                    // if mr <= 2
         CSEL    x13, x15, x13, LS  //   a2 = a1
         CSEL     x9,  x8,  x9, LS  //   c2 = c1
+        BIC      x2, x2, 3
 
         CMP      x0, 4             // if mr < 4
         ADD      x4, x13, x4       // a3 = a2 + a_stride
         ADD      x7,  x9, x7       // c3 = c2 + cm_stride
-        STP     d10, d11, [sp, 16]
         CSEL     x4, x13, x4, LO   //   a3 = a2
         CSEL     x7,  x9, x7, LO   //   c3 = c2
 
@@ -58,19 +60,18 @@
 0:
         # Load initial bias from w into accumulators
         LDP     q16, q20, [x5], 32
-        SUBS    x0, x2, 16         // k = kc - 16
         MOV     v17.16b, v16.16b
         MOV     v18.16b, v16.16b
         LDP     q24, q28, [x5], 32
         MOV     v19.16b, v16.16b
         MOV     v21.16b, v20.16b
+        LDR     x11, [sp, 40]      // params
         MOV     v22.16b, v20.16b
         MOV     v23.16b, v20.16b
-        LDR     x11, [sp, 40]      // params
         MOV     v25.16b, v24.16b
         MOV     v26.16b, v24.16b
-        AND     x10, x2, 15        // kc remainder 0 to 15
         MOV     v27.16b, v24.16b
+        SUBS    x0, x2, 16         // k = kc - 16
         MOV     v29.16b, v28.16b
         MOV     v30.16b, v28.16b
         MOV     v31.16b, v28.16b
@@ -389,14 +390,16 @@
         INS     v11.d[1], x14
         SDOT    v26.4s, v10.16b, v6.4b[1]
         SDOT    v27.4s, v10.16b, v7.4b[1]
+        AND     x0, x2, 15        // kc remainder 0 to 12
 
         // BLOCK 3
         SDOT    v28.4s, v11.16b, v4.4b[1]
         SDOT    v29.4s, v11.16b, v5.4b[1]
         SDOT    v30.4s, v11.16b, v6.4b[1]
         SDOT    v31.4s, v11.16b, v7.4b[1]
-        # Is there a remainder?- 1 to 15 bytes of A
-        CBNZ    x10, 4f
+
+        # Is there a remainder? - 4 to 12 bytes of A
+        CBNZ    x0, 5f
 
         .p2align 3
 3:
@@ -525,7 +528,7 @@
         SMIN     v5.16b,  v5.16b, v1.16b
         SMIN     v6.16b,  v6.16b, v1.16b
         SMIN     v7.16b,  v7.16b, v1.16b
-        B.LO    5f
+        B.LO    6f
 
         # Store full 4 x 16
         ST1     {v4.16b}, [x6], x12
@@ -542,15 +545,18 @@
         LDP      d8,  d9, [sp], 32
         RET
 
-        # Remainder- 1 to 15 bytes of A
+        # Remainder- 4 to 12 bytes of A
+        # Although C4, it is safe to read 16 bytes.
         .p2align 3
 4:
+        AND     x0, x2, 15        // kc remainder 4 to 12
+5:
         LDP      q8,  q9,  [x5], 32
         LDP     q10, q11,  [x5], 32
-        LD1     {v0.16b},  [x3], x10
-        LD1     {v1.16b}, [x15], x10
-        LD1     {v2.16b}, [x13], x10
-        LD1     {v3.16b},  [x4], x10
+        LD1     {v0.16b},  [x3], x0
+        LD1     {v1.16b}, [x15], x0
+        LD1     {v2.16b}, [x13], x0
+        LD1     {v3.16b},  [x4], x0
         SDOT    v16.4s,  v8.16b, v0.4b[0]
         SDOT    v17.4s,  v8.16b, v1.4b[0]
         SDOT    v18.4s,  v8.16b, v2.4b[0]
@@ -567,7 +573,7 @@
         SDOT    v29.4s, v11.16b, v1.4b[0]
         SDOT    v30.4s, v11.16b, v2.4b[0]
         SDOT    v31.4s, v11.16b, v3.4b[0]
-        CMP     x10, 4
+        CMP     x0, 4
         B.LS    3b
         LDP      q8,  q9,  [x5], 32
         LDP     q10, q11,  [x5], 32
@@ -587,7 +593,7 @@
         SDOT    v29.4s, v11.16b, v1.4b[1]
         SDOT    v30.4s, v11.16b, v2.4b[1]
         SDOT    v31.4s, v11.16b, v3.4b[1]
-        CMP     x10, 8
+        CMP     x0, 8
         B.LS    3b
         LDP       q8,  q9,  [x5], 32
         LDP      q10, q11,  [x5], 32
@@ -607,32 +613,12 @@
         SDOT    v29.4s, v11.16b, v1.4b[2]
         SDOT    v30.4s, v11.16b, v2.4b[2]
         SDOT    v31.4s, v11.16b, v3.4b[2]
-        CMP     x10, 12
-        B.LS    3b
-        LDP       q8,   q9, [x5], 32
-        LDP      q10,  q11, [x5], 32
-        SDOT    v16.4s,  v8.16b, v0.4b[3]
-        SDOT    v17.4s,  v8.16b, v1.4b[3]
-        SDOT    v18.4s,  v8.16b, v2.4b[3]
-        SDOT    v19.4s,  v8.16b, v3.4b[3]
-        SDOT    v20.4s,  v9.16b, v0.4b[3]
-        SDOT    v21.4s,  v9.16b, v1.4b[3]
-        SDOT    v22.4s,  v9.16b, v2.4b[3]
-        SDOT    v23.4s,  v9.16b, v3.4b[3]
-        SDOT    v24.4s, v10.16b, v0.4b[3]
-        SDOT    v25.4s, v10.16b, v1.4b[3]
-        SDOT    v26.4s, v10.16b, v2.4b[3]
-        SDOT    v27.4s, v10.16b, v3.4b[3]
-        SDOT    v28.4s, v11.16b, v0.4b[3]
-        SDOT    v29.4s, v11.16b, v1.4b[3]
-        SDOT    v30.4s, v11.16b, v2.4b[3]
-        SDOT    v31.4s, v11.16b, v3.4b[3]
         B       3b
 
         # Store odd width
         .p2align 3
-5:
-        TBZ     x1, 3, 6f
+6:
+        TBZ     x1, 3, 7f
         STR     d4, [x6], 8
         DUP     d4, v4.d[1]
         STR     d5, [x8], 8
@@ -641,8 +627,8 @@
         DUP     d6, v6.d[1]
         STR     d7, [x7], 8
         DUP     d7, v7.d[1]
-6:
-        TBZ     x1, 2, 7f
+7:
+        TBZ     x1, 2, 8f
         STR     s4, [x6], 4
         DUP     s4, v4.s[1]
         STR     s5, [x8], 4
@@ -651,8 +637,8 @@
         DUP     s6, v6.s[1]
         STR     s7, [x7], 4
         DUP     s7, v7.s[1]
-7:
-        TBZ     x1, 1, 8f
+8:
+        TBZ     x1, 1, 9f
         ST1     {v4.h}[0], [x6], 2
         DUP      h4, v4.h[1]
         ST1     {v5.h}[0], [x8], 2
@@ -661,13 +647,13 @@
         DUP      h6, v6.h[1]
         ST1     {v7.h}[0], [x7], 2
         DUP      h7, v7.h[1]
-8:
-        TBZ     x1, 0, 9f
+9:
+        TBZ     x1, 0, 10f
         ST1     {v4.b}[0], [x6]
         ST1     {v5.b}[0], [x8]
         ST1     {v6.b}[0], [x9]
         ST1     {v7.b}[0], [x7]
-9:
+10:
         LDP     d10, d11, [sp, 16]
         LDP      d8,  d9, [sp], 32
         RET
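The cortex-a55 hunks above reorder the prologue that clamps the A and C row pointers and now fold the kc round-up into it. For reference, the CSEL-based clamping used by all of these kernels is equivalent to the C sketch below (shown for the mr < 2 case); the function name and signature are assumptions for illustration.

#include <stddef.h>
#include <stdint.h>

/* Sketch of the "Clamp A and C pointers" prologue: when mr is smaller than the
 * kernel's row count, the extra row aliases the previous one so that loads and
 * stores stay in bounds. */
static void clamp_row_pointers_sketch(size_t mr,
                                      const int8_t* a0, size_t a_stride,
                                      int8_t* c0, size_t cm_stride,
                                      const int8_t** a1, int8_t** c1) {
  *a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);  /* ADD  x15, x3, x4 */
  *c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);       /* ADD   x8, x6, x7 */
  if (mr < 2) {                                       /* CMP / CSEL ..., LO */
    *a1 = a0;
    *c1 = c0;
  }
}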
diff --git a/src/qs8-gemm/4x16c4-aarch64-neondot-ld32.S b/src/qs8-gemm/4x16c4-aarch64-neondot-ld32.S
new file mode 100644
index 0000000..ccdfa9c
--- /dev/null
+++ b/src/qs8-gemm/4x16c4-aarch64-neondot-ld32.S
@@ -0,0 +1,296 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <xnnpack/assembly.h>
+
+# void xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld32(
+#     size_t mr,                 x0
+#     size_t nc,                 x1
+#     size_t kc,                 x2 / x0
+#     const int8_t* restrict a,  x3
+#     size_t a_stride,           x4
+#     const void* restrict w,    x5
+#     int8_t* restrict c,        x6
+#     size_t cm_stride,          x7
+#     size_t cn_stride,          [sp] -> x12
+#     const union xnn_qs8_gemm_params params)  [sp + 8] -> x11
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0  x3 v0
+# A1 x15 v1
+# A2 x13 v2
+# A3  x4 v3
+# B   x5 v4  v5  v6  v7
+# C0  x6 v16 v20 v24 v28
+# C1  x8 v17 v21 v25 v29
+# C2  x9 v18 v22 v26 v30
+# C3  x7 v19 v23 v27 v31
+# unused v8 v9 v10 v11 v12 v13 v14 v15
+
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld32
+
+        # Clamp A and C pointers
+        CMP      x0, 2             // if mr < 2
+        ADD      x2, x2, 3         // kc = (kc + 3) & ~3
+        ADD     x15, x3, x4        // a1 = a0 + a_stride
+        ADD      x8, x6, x7        // c1 = c0 + cm_stride
+        CSEL    x15, x3, x15, LO   //   a1 = a0
+        CSEL     x8, x6,  x8, LO   //   c1 = c0
+        BIC      x2, x2, 3
+
+        ADD     x13, x15, x4       // a2 = a1 + a_stride
+        ADD      x9,  x8, x7       // c2 = c1 + cm_stride
+                                   // if mr <= 2
+        CSEL    x13, x15, x13, LS  //   a2 = a1
+        CSEL     x9,  x8,  x9, LS  //   c2 = c1
+
+        CMP      x0, 4             // if mr < 4
+        ADD      x4, x13, x4       // a3 = a2 + a_stride
+        ADD      x7,  x9, x7       // c3 = c2 + cm_stride
+        CSEL     x4, x13, x4, LO   //   a3 = a2
+        CSEL     x7,  x9, x7, LO   //   c3 = c2
+
+        .p2align 3
+0:
+        # Load initial bias from w into accumulators
+        LDP     q16, q20, [x5], 32
+        MOV     v17.16b, v16.16b
+        MOV     v18.16b, v16.16b
+        LDP     q24, q28, [x5], 32
+        MOV     v19.16b, v16.16b
+        MOV     v21.16b, v20.16b
+        LDR     x11, [sp, 8]       // params
+        MOV     v22.16b, v20.16b
+        MOV     v23.16b, v20.16b
+        MOV     x0, x2             // k = kc.  assumes kc > 0
+        MOV     v25.16b, v24.16b
+        MOV     v26.16b, v24.16b
+        MOV     v27.16b, v24.16b
+        MOV     v29.16b, v28.16b
+        MOV     v30.16b, v28.16b
+        MOV     v31.16b, v28.16b
+
+        # Main loop - 4 bytes of A
+        .p2align 3
+1:
+        LDR     s0,  [x3], 4
+        LDR     q4, [x5], 16
+        LDR     s1, [x15], 4
+        LDR     s2, [x13], 4
+        LDR     s3,  [x4], 4
+        SDOT    v16.4s, v4.16b, v0.4b[0]
+        SDOT    v17.4s, v4.16b, v1.4b[0]
+        LDR     q5, [x5], 16
+        SDOT    v18.4s, v4.16b, v2.4b[0]
+        SDOT    v19.4s, v4.16b, v3.4b[0]
+        LDR     q6, [x5], 16
+        SDOT    v20.4s, v5.16b, v0.4b[0]
+        SDOT    v21.4s, v5.16b, v1.4b[0]
+        LDR     q7, [x5], 16
+        SDOT    v22.4s, v5.16b, v2.4b[0]
+        SDOT    v23.4s, v5.16b, v3.4b[0]
+        SUBS    x0, x0, 4
+        SDOT    v24.4s, v6.16b, v0.4b[0]
+        SDOT    v25.4s, v6.16b, v1.4b[0]
+        SDOT    v26.4s, v6.16b, v2.4b[0]
+        SDOT    v27.4s, v6.16b, v3.4b[0]
+        SDOT    v28.4s, v7.16b, v0.4b[0]
+        SDOT    v29.4s, v7.16b, v1.4b[0]
+        SDOT    v30.4s, v7.16b, v2.4b[0]
+        SDOT    v31.4s, v7.16b, v3.4b[0]
+        B.HI    1b
+
+        # Apply params - scale, shift, bias and clamp
+        LD2R    {v0.4s, v1.4s}, [x11], 8
+        CMEQ    v2.4s, v1.4s, 0
+
+        BIC     v4.16b, v16.16b, v2.16b
+        BIC     v5.16b, v17.16b, v2.16b
+        BIC     v6.16b, v18.16b, v2.16b
+        BIC     v7.16b, v19.16b, v2.16b
+
+        SQRDMULH  v16.4s, v16.4s, v0.4s
+        SQRDMULH  v17.4s, v17.4s, v0.4s
+        SQRDMULH  v18.4s, v18.4s, v0.4s
+        SQRDMULH  v19.4s, v19.4s, v0.4s
+
+        SSRA    v16.4s, v4.4s, 31  // signed shift right accumulate
+        SSRA    v17.4s, v5.4s, 31
+        SSRA    v18.4s, v6.4s, 31
+        SSRA    v19.4s, v7.4s, 31
+
+        BIC     v4.16b, v20.16b, v2.16b
+        BIC     v5.16b, v21.16b, v2.16b
+        BIC     v6.16b, v22.16b, v2.16b
+        BIC     v7.16b, v23.16b, v2.16b
+
+        SQRDMULH  v20.4s, v20.4s, v0.4s
+        SQRDMULH  v21.4s, v21.4s, v0.4s
+        SQRDMULH  v22.4s, v22.4s, v0.4s
+        SQRDMULH  v23.4s, v23.4s, v0.4s
+
+        SSRA    v20.4s, v4.4s, 31
+        SSRA    v21.4s, v5.4s, 31
+        SSRA    v22.4s, v6.4s, 31
+        SSRA    v23.4s, v7.4s, 31
+
+        BIC     v4.16b, v24.16b, v2.16b
+        BIC     v5.16b, v25.16b, v2.16b
+        BIC     v6.16b, v26.16b, v2.16b
+        BIC     v7.16b, v27.16b, v2.16b
+
+        SQRDMULH  v24.4s, v24.4s, v0.4s
+        SQRDMULH  v25.4s, v25.4s, v0.4s
+        SQRDMULH  v26.4s, v26.4s, v0.4s
+        SQRDMULH  v27.4s, v27.4s, v0.4s
+
+        SSRA    v24.4s, v4.4s, 31
+        SSRA    v25.4s, v5.4s, 31
+        SSRA    v26.4s, v6.4s, 31
+        SSRA    v27.4s, v7.4s, 31
+
+        BIC     v4.16b, v28.16b, v2.16b
+        BIC     v5.16b, v29.16b, v2.16b
+        BIC     v6.16b, v30.16b, v2.16b
+        BIC     v7.16b, v31.16b, v2.16b
+
+        SQRDMULH  v28.4s, v28.4s, v0.4s
+        SQRDMULH  v29.4s, v29.4s, v0.4s
+        SQRDMULH  v30.4s, v30.4s, v0.4s
+        SQRDMULH  v31.4s, v31.4s, v0.4s
+
+        SSRA    v28.4s, v4.4s, 31
+        SSRA    v29.4s, v5.4s, 31
+        SSRA    v30.4s, v6.4s, 31
+        SSRA    v31.4s, v7.4s, 31
+
+        SRSHL   v16.4s, v16.4s, v1.4s  // signed rounding shift left
+        SRSHL   v17.4s, v17.4s, v1.4s
+        SRSHL   v18.4s, v18.4s, v1.4s
+        SRSHL   v19.4s, v19.4s, v1.4s
+        SRSHL   v20.4s, v20.4s, v1.4s
+        SRSHL   v21.4s, v21.4s, v1.4s
+        SRSHL   v22.4s, v22.4s, v1.4s
+        SRSHL   v23.4s, v23.4s, v1.4s
+        SRSHL   v24.4s, v24.4s, v1.4s
+        SRSHL   v25.4s, v25.4s, v1.4s
+        SRSHL   v26.4s, v26.4s, v1.4s
+        SRSHL   v27.4s, v27.4s, v1.4s
+        SRSHL   v28.4s, v28.4s, v1.4s
+        SRSHL   v29.4s, v29.4s, v1.4s
+        SRSHL   v30.4s, v30.4s, v1.4s
+        SRSHL   v31.4s, v31.4s, v1.4s
+
+        SQXTN   v16.4h, v16.4s
+        SQXTN   v17.4h, v17.4s
+        SQXTN   v18.4h, v18.4s
+        SQXTN   v19.4h, v19.4s
+        SQXTN   v24.4h, v24.4s
+        SQXTN   v25.4h, v25.4s
+        SQXTN   v26.4h, v26.4s
+        SQXTN   v27.4h, v27.4s
+        LD1R    {v2.8h}, [x11], 2   // add bias
+
+        SQXTN2  v16.8h, v20.4s
+        SQXTN2  v17.8h, v21.4s
+        SQXTN2  v18.8h, v22.4s
+        SQXTN2  v19.8h, v23.4s
+        SQXTN2  v24.8h, v28.4s
+        SQXTN2  v25.8h, v29.4s
+        SQXTN2  v26.8h, v30.4s
+        SQXTN2  v27.8h, v31.4s
+
+        SQADD   v16.8h, v16.8h, v2.8h
+        SQADD   v17.8h, v17.8h, v2.8h
+        SQADD   v18.8h, v18.8h, v2.8h
+        SQADD   v19.8h, v19.8h, v2.8h
+        SQADD   v24.8h, v24.8h, v2.8h
+        SQADD   v25.8h, v25.8h, v2.8h
+        SQADD   v26.8h, v26.8h, v2.8h
+        SQADD   v27.8h, v27.8h, v2.8h
+        LD1R    {v0.16b}, [x11], 1  // clamp min value
+
+        SQXTN    v4.8b, v16.8h
+        SQXTN    v5.8b, v17.8h
+        SQXTN    v6.8b, v18.8h
+        SQXTN    v7.8b, v19.8h
+        LD1R    {v1.16b}, [x11]     // clamp max value
+        SQXTN2   v4.16b, v24.8h
+        SQXTN2   v5.16b, v25.8h
+        SQXTN2   v6.16b, v26.8h
+        SQXTN2   v7.16b, v27.8h
+        LDR     x12, [sp]   // cn_stride
+
+        SMAX    v4.16b, v4.16b, v0.16b
+        SMAX    v5.16b, v5.16b, v0.16b
+        SMAX    v6.16b, v6.16b, v0.16b
+        SMAX    v7.16b, v7.16b, v0.16b
+        SUBS    x1, x1, 16
+        SMIN    v4.16b, v4.16b, v1.16b
+        SMIN    v5.16b, v5.16b, v1.16b
+        SMIN    v6.16b, v6.16b, v1.16b
+        SMIN    v7.16b, v7.16b, v1.16b
+        B.LO    2f
+
+        # Store full 4 x 16
+        ST1     {v4.16b}, [x6], x12
+        SUB      x3,  x3, x2         // a0 -= kc
+        ST1     {v5.16b}, [x8], x12
+        SUB     x15, x15, x2         // a1 -= kc
+        ST1     {v6.16b}, [x9], x12
+        SUB     x13, x13, x2         // a2 -= kc
+        ST1     {v7.16b}, [x7], x12
+        SUB      x4,  x4, x2         // a3 -= kc
+        B.NE    0b
+        RET
+
+        # Store odd width
+        .p2align 3
+2:
+        TBZ     x1, 3, 3f
+        STR     d4, [x6], 8
+        DUP     d4, v4.d[1]
+        STR     d5, [x8], 8
+        DUP     d5, v5.d[1]
+        STR     d6, [x9], 8
+        DUP     d6, v6.d[1]
+        STR     d7, [x7], 8
+        DUP     d7, v7.d[1]
+3:
+        TBZ     x1, 2, 4f
+        STR     s4, [x6], 4
+        DUP     s4, v4.s[1]
+        STR     s5, [x8], 4
+        DUP     s5, v5.s[1]
+        STR     s6, [x9], 4
+        DUP     s6, v6.s[1]
+        STR     s7, [x7], 4
+        DUP     s7, v7.s[1]
+4:
+        TBZ     x1, 1, 5f
+        ST1     {v4.h}[0], [x6], 2
+        DUP     h4, v4.h[1]
+        ST1     {v5.h}[0], [x8], 2
+        DUP     h5, v5.h[1]
+        ST1     {v6.h}[0], [x9], 2
+        DUP     h6, v6.h[1]
+        ST1     {v7.h}[0], [x7], 2
+        DUP     h7, v7.h[1]
+5:
+        TBZ     x1, 0, 6f
+        ST1     {v4.b}[0], [x6]
+        ST1     {v5.b}[0], [x8]
+        ST1     {v6.b}[0], [x9]
+        ST1     {v7.b}[0], [x7]
+6:
+        RET
+
+END_FUNCTION xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld32
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
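The ld32 kernel above leans entirely on the by-element SDOT form: each `SDOT vACC.4s, vB.16b, vA.4b[k]` adds, into every 32-bit lane of the accumulator, the dot product of four signed bytes of B with the k-th group of four signed bytes of A. A minimal scalar model of one such instruction is sketched below; the function name is an assumption for illustration.

#include <stdint.h>

/* Scalar model of one "SDOT vacc.4s, vb.16b, va.4b[lane]" instruction. */
static void sdot_lane_sketch(int32_t acc[4], const int8_t b[16],
                             const int8_t a[16], int lane) {
  for (int n = 0; n < 4; n++) {      /* one 32-bit accumulator lane per column */
    int32_t sum = 0;
    for (int k = 0; k < 4; k++) {    /* 4-element signed dot product */
      sum += (int32_t) b[n * 4 + k] * (int32_t) a[lane * 4 + k];
    }
    acc[n] += sum;
  }
}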
diff --git a/src/qs8-gemm/4x16c4-aarch64-neondot-ld64.S b/src/qs8-gemm/4x16c4-aarch64-neondot-ld64.S
index 3f061b2..12e132d 100644
--- a/src/qs8-gemm/4x16c4-aarch64-neondot-ld64.S
+++ b/src/qs8-gemm/4x16c4-aarch64-neondot-ld64.S
@@ -8,7 +8,7 @@
 # void xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
-#     size_t kc,                 x2 / x0 / x10
+#     size_t kc,                 x2 / x0
 #     const int8_t* restrict a,  x3
 #     size_t a_stride,           x4
 #     const void* restrict w,    x5
@@ -35,10 +35,12 @@
 
         # Clamp A and C pointers
         CMP      x0, 2             // if mr < 2
+        ADD      x2, x2, 3         // kc = (kc + 3) & ~3
         ADD     x15, x3, x4        // a1 = a0 + a_stride
         ADD      x8, x6, x7        // c1 = c0 + cm_stride
         CSEL    x15, x3, x15, LO   //   a1 = a0
         CSEL     x8, x6,  x8, LO   //   c1 = c0
+        BIC      x2, x2, 3
 
         ADD     x13, x15, x4       // a2 = a1 + a_stride
         ADD      x9,  x8, x7       // c2 = c1 + cm_stride
@@ -56,18 +58,17 @@
 0:
         # Load initial bias from w into accumulators
         LDP     q16, q20, [x5], 32
-        SUBS    x0, x2, 8          // k = kc - 8
         MOV     v17.16b, v16.16b
         MOV     v18.16b, v16.16b
         LDP     q24, q28, [x5], 32
         MOV     v19.16b, v16.16b
         MOV     v21.16b, v20.16b
+        LDR     x11, [sp, 8]       // params
         MOV     v22.16b, v20.16b
         MOV     v23.16b, v20.16b
-        LDR     x11, [sp, 8]      // params
+        SUBS    x0, x2, 8          // k = kc - 8
         MOV     v25.16b, v24.16b
         MOV     v26.16b, v24.16b
-        AND     x10, x2, 7         // kc remainder 0 to 7
         MOV     v27.16b, v24.16b
         MOV     v29.16b, v28.16b
         MOV     v30.16b, v28.16b
@@ -79,23 +80,23 @@
         .p2align 3
 1:
         LDR     d0,  [x3], 8
-        LDR     q4, [x5], 16
+        LDR     q4,  [x5], 16
         LDR     d1, [x15], 8
         LDR     d2, [x13], 8
         LDR     d3,  [x4], 8
+        LDR     q5,  [x5], 16
         SDOT    v16.4s, v4.16b,  v0.4b[0]
-        LDR     q5, [x5], 16
         SDOT    v17.4s, v4.16b,  v1.4b[0]
+        LDP     q6, q7, [x5], 32
         SDOT    v18.4s, v4.16b,  v2.4b[0]
         SDOT    v19.4s, v4.16b,  v3.4b[0]
         SDOT    v20.4s, v5.16b,  v0.4b[0]
-        LDP     q6, q7, [x5], 32
         SDOT    v21.4s, v5.16b,  v1.4b[0]
         SDOT    v22.4s, v5.16b,  v2.4b[0]
         SDOT    v23.4s, v5.16b,  v3.4b[0]
         SDOT    v24.4s, v6.16b, v0.4b[0]
-        LDP      q4,  q5, [x5], 32
         SDOT    v25.4s, v6.16b, v1.4b[0]
+        LDP     q4, q5, [x5], 32
         SDOT    v26.4s, v6.16b, v2.4b[0]
         SDOT    v27.4s, v6.16b, v3.4b[0]
         SDOT    v28.4s, v7.16b, v0.4b[0]
@@ -103,8 +104,8 @@
         SDOT    v30.4s, v7.16b, v2.4b[0]
         SDOT    v31.4s, v7.16b, v3.4b[0]
         SDOT    v16.4s, v4.16b,  v0.4b[1]
-        LDP      q6,  q7, [x5], 32
         SDOT    v17.4s, v4.16b,  v1.4b[1]
+        LDP     q6, q7, [x5], 32
         SDOT    v18.4s, v4.16b,  v2.4b[1]
         SDOT    v19.4s, v4.16b,  v3.4b[1]
         SDOT    v20.4s, v5.16b,  v0.4b[1]
@@ -118,18 +119,16 @@
         SDOT    v28.4s, v7.16b,  v0.4b[1]
         SDOT    v29.4s, v7.16b,  v1.4b[1]
         SDOT    v30.4s, v7.16b,  v2.4b[1]
-        SDOT    v31.4s, v7.16b,  v3.4b[1]
         SUBS    x0, x0, 8
+        SDOT    v31.4s, v7.16b,  v3.4b[1]
         B.HS    1b
 
-        # Is there a remainder?- 1 to 7 bytes of A
-        CBNZ    x10, 3f
+        # Is there a remainder? - 4 bytes of A
+        TBNZ    x0, 2, 3f
 
-        .p2align 3
 2:
         # Apply params - scale, shift, bias and clamp
-        LD1R    {v0.4s}, [x11], 4
-        LD1R    {v1.4s}, [x11], 4
+        LD2R    {v0.4s, v1.4s}, [x11], 8
         CMEQ    v2.4s, v1.4s, 0
 
         BIC     v4.16b, v16.16b, v2.16b
@@ -272,21 +271,21 @@
         B.NE    0b
         RET
 
-        # Remainder- 1 to 7 bytes of A
+        # Remainder- 4 bytes of A
         .p2align 3
 3:
-        LD1     {v0.8b},  [x3], x10
-        LDP     q4, q5, [x5], 32
-        LD1     {v1.8b}, [x15], x10
-        LD1     {v2.8b}, [x13], x10
-        LD1     {v3.8b},  [x4], x10
-        CMP     x10, 4
+        LDR     s0,  [x3], 4
+        LDR     q4, [x5], 16
+        LDR     s1, [x15], 4
+        LDR     s2, [x13], 4
+        LDR     s3,  [x4], 4
         SDOT    v16.4s, v4.16b,  v0.4b[0]
+        LDR     q5, [x5], 16
         SDOT    v17.4s, v4.16b,  v1.4b[0]
         SDOT    v18.4s, v4.16b,  v2.4b[0]
         SDOT    v19.4s, v4.16b,  v3.4b[0]
-        LDP     q6, q7, [x5], 32
         SDOT    v20.4s, v5.16b,  v0.4b[0]
+        LDP     q6, q7, [x5], 32
         SDOT    v21.4s, v5.16b,  v1.4b[0]
         SDOT    v22.4s, v5.16b,  v2.4b[0]
         SDOT    v23.4s, v5.16b,  v3.4b[0]
@@ -298,25 +297,6 @@
         SDOT    v29.4s, v7.16b, v1.4b[0]
         SDOT    v30.4s, v7.16b, v2.4b[0]
         SDOT    v31.4s, v7.16b, v3.4b[0]
-        B.LS    2b
-        LDP      q4,  q5, [x5], 32
-        SDOT    v16.4s, v4.16b,  v0.4b[1]
-        SDOT    v17.4s, v4.16b,  v1.4b[1]
-        SDOT    v18.4s, v4.16b,  v2.4b[1]
-        SDOT    v19.4s, v4.16b,  v3.4b[1]
-        LDP      q6,  q7, [x5], 32
-        SDOT    v20.4s, v5.16b,  v0.4b[1]
-        SDOT    v21.4s, v5.16b,  v1.4b[1]
-        SDOT    v22.4s, v5.16b,  v2.4b[1]
-        SDOT    v23.4s, v5.16b,  v3.4b[1]
-        SDOT    v24.4s, v6.16b,  v0.4b[1]
-        SDOT    v25.4s, v6.16b,  v1.4b[1]
-        SDOT    v26.4s, v6.16b,  v2.4b[1]
-        SDOT    v27.4s, v6.16b,  v3.4b[1]
-        SDOT    v28.4s, v7.16b,  v0.4b[1]
-        SDOT    v29.4s, v7.16b,  v1.4b[1]
-        SDOT    v30.4s, v7.16b,  v2.4b[1]
-        SDOT    v31.4s, v7.16b,  v3.4b[1]
         B       2b
 
         # Store odd width
diff --git a/src/qs8-gemm/MRx16c8-avx512skx.c.in b/src/qs8-gemm/MRx16c8-avx512skx.c.in
index 973c81c..f28a7db 100644
--- a/src/qs8-gemm/MRx16c8-avx512skx.c.in
+++ b/src/qs8-gemm/MRx16c8-avx512skx.c.in
@@ -12,6 +12,7 @@
 
 #include <xnnpack/gemm.h>
 #include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/math.h>
 
 
 $GEMM_SUFFIX = "_xw" if VARIANT == "EXTENDED" else ""
@@ -36,6 +37,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   $for M in range(1, MR):
@@ -196,10 +198,10 @@
         _mm_storeu_si128((__m128i*) c1, _mm256_extracti128_si256(vout01x0123456789ABCDEF, 1));
 
         $for M in range(MR):
-          a${M} = (const int8_t*) ((uintptr_t) a${M} - k);
+          c${M} = (int8_t*) ((uintptr_t) c${M} + cn_stride);
 
         $for M in range(MR):
-          c${M} = (int8_t*) ((uintptr_t) c${M} + cn_stride);
+          a${M} = (const int8_t*) ((uintptr_t) a${M} - kc);
 
         nc -= 16;
       } else {
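The template hunks in this change (and the ADD/BIC pairs added to the assembly kernels) all round kc up to the kernel's channel-block size, e.g. `kc = round_up_po2(kc, 8);`, so the inner loops can assume a whole number of blocks and the old partial-block tails can be deleted. The helper is pulled in via the new `#include <xnnpack/math.h>` lines; a behaviourally equivalent sketch for a power-of-two quantum is:

#include <stddef.h>

/* Sketch: round n up to the next multiple of q, where q is a power of two.
 * Mirrors the "ADD x2, x2, q-1; BIC x2, x2, q-1" sequence in the assembly. */
static inline size_t round_up_po2_sketch(size_t n, size_t q) {
  return (n + q - 1) & ~(q - 1);
}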
diff --git a/src/qs8-gemm/MRx4c2-sse.c.in b/src/qs8-gemm/MRx4c2-sse.c.in
index d48eaa5..53eb918 100644
--- a/src/qs8-gemm/MRx4c2-sse.c.in
+++ b/src/qs8-gemm/MRx4c2-sse.c.in
@@ -19,6 +19,7 @@
   #include <${SSE_HEADER}>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 $LOAD_SUFFIX = {"LD128": "_ld128", "LD64": "_ld64", "EXTENDED": ""}[VARIANT]
@@ -45,6 +46,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   $for M in range(1, MR):
@@ -205,27 +207,6 @@
             $else:
               vacc${M}x0123 = _mm_add_epi32(vacc${M}x0123,
                 _mm_madd_epi16(_mm_shuffle_epi32(vxa${M}, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-          if (k > 6 * sizeof(int8_t)) {
-            $if VARIANT == "EXTENDED":
-              const __m128i vxb3 = _mm_load_si128((const __m128i*) w);
-              w = (const void*) ((uintptr_t) w + 8 * sizeof(int16_t));
-            $else:
-              const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-              $if SSE >= 4:
-                const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
-              $else:
-                const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
-              w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-            $for M in range(MR):
-              $if SSE == 5:
-                vacc${M}x0123 = _mm_maddd_epi16(
-                  _mm_shuffle_epi32(vxa${M}, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc${M}x0123);
-              $else:
-                vacc${M}x0123 = _mm_add_epi32(vacc${M}x0123,
-                  _mm_madd_epi16(_mm_shuffle_epi32(vxa${M}, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-          }
         }
       }
     }
@@ -325,10 +306,10 @@
           *((uint32_t*) c${M}) = (uint32_t) _mm_cvtsi128_si32(vout);
 
       $for M in range(MR):
-        a${M} = (const int8_t*) ((uintptr_t) a${M} - kc);
+        c${M} = (int8_t*) ((uintptr_t) c${M} + cn_stride);
 
       $for M in range(MR):
-        c${M} = (int8_t*) ((uintptr_t) c${M} + cn_stride);
+        a${M} = (const int8_t*) ((uintptr_t) a${M} - kc);
 
       nc -= 4;
     } else {
diff --git a/src/qs8-gemm/MRx4c8-sse.c.in b/src/qs8-gemm/MRx4c8-sse.c.in
index 2e8dbe5..9e98cfd 100644
--- a/src/qs8-gemm/MRx4c8-sse.c.in
+++ b/src/qs8-gemm/MRx4c8-sse.c.in
@@ -19,6 +19,7 @@
   #include <${SSE_HEADER}>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 $LOAD_SUFFIX = {"LD128": "_ld128", "LD64": "_ld64", "EXTENDED": ""}[VARIANT]
@@ -45,6 +46,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   $for M in range(1, MR):
@@ -244,10 +246,10 @@
           *((uint32_t*) c${M}) = (uint32_t) _mm_cvtsi128_si32(vout);
 
       $for M in range(MR):
-        a${M} = (const int8_t*) ((uintptr_t) a${M} - k);
+        c${M} = (int8_t*) ((uintptr_t) c${M} + cn_stride);
 
       $for M in range(MR):
-        c${M} = (int8_t*) ((uintptr_t) c${M} + cn_stride);
+        a${M} = (const int8_t*) ((uintptr_t) a${M} - kc);
 
       nc -= 4;
     } else {
diff --git a/src/qs8-gemm/MRx4c8-wasmsimd.c.in b/src/qs8-gemm/MRx4c8-wasmsimd.c.in
index dbc36b2..a10c7dd 100644
--- a/src/qs8-gemm/MRx4c8-wasmsimd.c.in
+++ b/src/qs8-gemm/MRx4c8-wasmsimd.c.in
@@ -10,6 +10,7 @@
 #include <wasm_simd128.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 $LOAD_SUFFIX = {"LD128": "_ld128", "LD64": "_ld64", "EXTENDED": ""}[VARIANT]
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   $for M in range(1, MR):
@@ -170,10 +172,10 @@
         *((float*) c${M}) = (float) wasm_f32x4_extract_lane(vout, ${M});
 
       $for M in range(MR):
-        a${M} = (const int8_t*) ((uintptr_t) a${M} - k);
+        c${M} = (int8_t*) ((uintptr_t) c${M} + cn_stride);
 
       $for M in range(MR):
-        c${M} = (int8_t*) ((uintptr_t) c${M} + cn_stride);
+        a${M} = (const int8_t*) ((uintptr_t) a${M} - kc);
 
       nc -= 4;
     } else {
diff --git a/src/qs8-gemm/MRx8c8-avx2.c.in b/src/qs8-gemm/MRx8c8-avx2.c.in
index e403810..3f17e73 100644
--- a/src/qs8-gemm/MRx8c8-avx2.c.in
+++ b/src/qs8-gemm/MRx8c8-avx2.c.in
@@ -11,6 +11,7 @@
 
 #include <xnnpack/gemm.h>
 #include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/math.h>
 
 
 $GEMM_SUFFIX = "_xw" if VARIANT == "EXTENDED" else ""
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   $for M in range(1, MR):
@@ -169,10 +171,10 @@
         _mm_storeh_pi((__m64*) c3, _mm_castsi128_ps(vout_hi));
 
       $for M in range(MR):
-        a${M} = (const int8_t*) ((uintptr_t) a${M} - k);
+        c${M} = (int8_t*) ((uintptr_t) c${M} + cn_stride);
 
       $for M in range(MR):
-        c${M} = (int8_t*) ((uintptr_t) c${M} + cn_stride);
+        a${M} = (const int8_t*) ((uintptr_t) a${M} - kc);
 
       nc -= 8;
     } else {
diff --git a/src/qs8-gemm/MRxNRc4-neondot.c.in b/src/qs8-gemm/MRxNRc4-neondot.c.in
index 7e56a4b..e9416c3 100644
--- a/src/qs8-gemm/MRxNRc4-neondot.c.in
+++ b/src/qs8-gemm/MRxNRc4-neondot.c.in
@@ -6,18 +6,14 @@
 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
 $assert NR % 8 == 0
 $assert 8 <= NR <= 16
-
 #include <assert.h>
 
 #include <arm_neon.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
-// This kernel uses ARMv8.2 dot-product instructions.
-//
-// Scalar model: xnn_qs8_gemm_minmax_ukernel_${MR}x${NR}c4__scalar. Refer to
-// that kernel for more comments.
 void xnn_qs8_gemm_minmax_ukernel_${MR}x${NR}c4__neondot(
     size_t mr,
     size_t nc,
@@ -33,7 +29,12 @@
   assert(mr <= ${MR});
   assert(nc != 0);
   assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
 
+  kc = round_up_po2(kc, 4);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   $for M in range(1, MR):
@@ -76,7 +77,7 @@
       // Load a 8x${NR} block of weights.
       $for K in range(0, 8, 4):
         $for N in range(0, NR, 4):
-          const int8x16_t vb${ABC[K:K+4]}x${ABC[N:N+4]} = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
+          const int8x16_t vb${ABC[K:K+4]}x${ABC[N:N+4]} = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
 
       // Multiply-accumulate: ${MR}x8 * 8x${NR} --> ${MR}x${NR}.
       $for K in range(0, 8, 4):
@@ -86,39 +87,23 @@
 
       k -= 8 * sizeof(int8_t);
     }
-    // Handle up to 7 final positions of `k`
+    // Handle up to 4 final positions of `k`
     if XNN_UNLIKELY(k != 0) {
       // Load a ${MR}x4 block of activations.
       $for M in range(MR):
-        const int8x8_t va${M}x01234567 = vld1_s8(a${M}); a${M} += k;
+        const int8x8_t va${M}x01234567 = vld1_s8(a${M}); a${M} += 4;
 
       // Load a 4x${NR} block of weights.
       $for N in range(0, NR, 4):
-        const int8x16_t vb0123x${ABC[N:N+4]} = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
+        const int8x16_t vb0123x${ABC[N:N+4]} = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
 
       // Multiply-accumulate: ${MR}x4 * 4x${NR} --> ${MR}x${NR}.
       $for M in range(MR):
         $for N in range(0, NR, 4):
             vacc${M}x${ABC[N:N+4]} = vdotq_lane_s32(vacc${M}x${ABC[N:N+4]}, vb0123x${ABC[N:N+4]}, va${M}x01234567, 0);
-
-      if (k > 4) {
-        // Load a 4x${NR} block of weights.
-        $for N in range(0, NR, 4):
-          const int8x16_t vb4567x${ABC[N:N+4]} = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-
-        // Multiply-accumulate: ${MR}x4 * 4x${NR} --> ${MR}x${NR}.
-        $for M in range(MR):
-          $for N in range(0, NR, 4):
-              vacc${M}x${ABC[N:N+4]} = vdotq_lane_s32(vacc${M}x${ABC[N:N+4]}, vb4567x${ABC[N:N+4]}, va${M}x01234567, 1);
-      }
     }
-    // End of accumulation loop. The variable `kc` contains the amount by which
-    // we advanced the `va` pointers, so we rewind by this amount now.
-    $for M in range(MR):
-      a${M} = (const int8_t*)((uintptr_t)a${M} - kc);
 
     // Post-accumulation work
-
     const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
     const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
 
@@ -209,6 +194,9 @@
       $for M in range(MR):
         c${M} = (int8_t*) ((uintptr_t) c${M} + cn_stride);
 
+      $for M in range(MR):
+        a${M} = (const int8_t*) ((uintptr_t) a${M} - kc);
+
       nc -= ${NR};
     } else {
       // Final case where not all of the ${NR} columns fit in the destination.
diff --git a/src/qs8-gemm/MRxNRc4-scalar.c.in b/src/qs8-gemm/MRxNRc4-scalar.c.in
deleted file mode 100644
index 5f32eb6..0000000
--- a/src/qs8-gemm/MRxNRc4-scalar.c.in
+++ /dev/null
@@ -1,162 +0,0 @@
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <xnnpack/gemm.h>
-
-#include <xnnpack/scalar-utils.h>
-
-// This kernel is a scalar model for a kernel using ARMv8.2 dot-product
-// instructions.
-//
-// XNN_DISABLE_TSAN is used because this kernel reads up to 3 bytes past the
-// bounds of the `a` matrix region, which may be a race condition with
-// another thread. We deem this acceptable because the values that are
-// read out of bounds do not affect the result, and the the compiler can't know
-// about this undefined behavior.
-void xnn_qs8_gemm_minmax_ukernel_${MR}x${NR}c4__scalar(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    const int8_t* restrict a,
-    size_t a_stride,
-    const void* restrict w,
-    int8_t* restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN {
-  assert(mr != 0);
-  assert(mr <= ${MR});
-  assert(nc != 0);
-  assert(kc != 0);
-
-  const int8_t* a0 = a;
-  int8_t* c0 = c;
-  $for M in range(1, MR):
-    const int8_t* a${M} = (const int8_t*) ((uintptr_t) a${M-1} + a_stride);
-    int8_t* c${M} = (int8_t*) ((uintptr_t) c${M-1} + cm_stride);
-    $if M % 2 == 0:
-      if XNN_UNPREDICTABLE(mr <= ${M}) {
-        a${M} = a${M-1};
-        c${M} = c${M-1};
-      }
-    $elif M + 1 == MR:
-      if XNN_UNPREDICTABLE(mr != ${M+1}) {
-        a${M} = a${M-1};
-        c${M} = c${M-1};
-      }
-    $else:
-      if XNN_UNPREDICTABLE(mr < ${M+1}) {
-        a${M} = a${M-1};
-        c${M} = c${M-1};
-      }
-
-  // Loop over groups of ${NR} columns.
-  do {
-    // `vaccMN` is the accumulator at row `M`, column `N`.
-    // Initialize accumulators with bias. ${NR} bias values are loaded from the
-    // weight matrix, at the start of the group of ${NR} columns.
-    $for N in range(NR):
-      int32_t bias${N} = ((const int32_t*)w)[${N}];
-      $for M in range(MR):
-        int32_t vacc${M}${N} = bias${N};
-
-    w = (const void*)((uintptr_t)w + ${NR} * sizeof(int32_t));
-
-    // Inner accumulation loop along the ${NR} columns.
-    // Handle 4 rows at each iteration: this is key to modelling what an
-    // actual kernel using ARMv8.2 dot-product instructions would look like.
-    size_t k = 0;
-    while (k < kc) {
-      // Load a ${MR}x4 block of activations.
-      $for M in range(MR):
-        $for K in range(4):
-          int32_t va${M}${K} = *a${M}++;
-
-      // Load a 4x${NR} block of weights.
-      $for N in range(NR):
-        $for K in range(4):
-          int32_t vb${K}${N} = ((const int8_t*)w)[${K}];
-
-        w = (const void*)((uintptr_t)w + 4 * sizeof(int8_t));
-
-      // Multiply-accumulate: ${MR}x4 * 4x${NR} --> ${MR}x${NR}. The inner size 4 here means
-      // we're computing 4D dot-products, which makes this a model for
-      // a ARMv8.2 dot-product kernel.
-      $for M in range(MR):
-        $for N in range(NR):
-          $for K in range(4):
-            vacc${M}${N} += va${M}${K} * vb${K}${N};
-
-      k += 4 * sizeof(int8_t);
-    }
-    // End of accumulation loop. The variable `k` contains the amount by which
-    // we advanced the `va` pointers, so we rewind by this amount now.
-    $for M in range(MR):
-      a${M} = (const int8_t*)((uintptr_t)a${M} - k);
-
-    // Post-accumulation work
-
-    const int32_t vmultiplier = params->scalar.multiplier;
-    const int64_t vq31rounding = INT64_C(0x40000000);
-    const int32_t vremainder_mask = params->scalar.remainder_mask;
-    const uint32_t vshift = params->scalar.shift;
-    const int32_t vremainder_threshold = params->scalar.remainder_threshold;
-    const int32_t voutput_min = params->scalar.output_min_less_zero_point;
-    const int32_t voutput_max = params->scalar.output_max_less_zero_point;
-    const int32_t voutput_zero_point = params->scalar.output_zero_point;
-
-    $for M in range(MR):
-      $for N in range(NR):
-        const int64_t vproduct${M}${N} = (int64_t)vacc${M}${N} * (int64_t)vmultiplier;
-
-    $for M in range(MR):
-      $for N in range(NR):
-        const int32_t vq31product${M}${N} = (int32_t)(uint32_t)((uint64_t)(vproduct${M}${N} + vq31rounding) >> 31);
-
-    $for M in range(MR):
-      $for N in range(NR):
-        const int32_t vremainder${M}${N} = (vq31product${M}${N} & vremainder_mask) - (int32_t)(vq31product${M}${N} < 0);
-
-    $for M in range(MR):
-      $for N in range(NR):
-        int32_t vout${M}${N} = asr_s32(vq31product${M}${N}, vshift) + (int32_t)(vremainder${M}${N} > vremainder_threshold);
-
-    $for M in range(MR):
-      $for N in range(NR):
-        vout${M}${N} = vout${M}${N} < voutput_min ? voutput_min : vout${M}${N};
-
-    $for M in range(MR):
-      $for N in range(NR):
-        vout${M}${N} = vout${M}${N} > voutput_max ? voutput_max : vout${M}${N};
-
-    $for M in range(MR):
-      $for N in range(NR):
-        vout${M}${N} += voutput_zero_point;
-
-    if XNN_LIKELY (nc >= ${NR}) {
-      // Main case where there the ${NR} columns fit in the destination.
-      $for M in range(MR):
-        $for N in range(NR):
-          c${M}[${N}] = (int8_t) vout${M}${N};
-
-      // Advance to the next ${NR} columns.
-      $for M in range(MR):
-        c${M} = (int8_t*)((uintptr_t)c${M} + cn_stride);
-
-      nc -= ${NR};
-    } else {
-      // Final case where not all of the ${NR} columns fit in the destination.
-      $for N in range(NR):
-        if (nc > ${N}) {
-          $for M in range(MR):
-            c${M}[${N}] = vout${M}${N};
-        }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
diff --git a/src/qs8-gemm/c16-neon-mlal-padal.c.in b/src/qs8-gemm/c16-neon-mlal-padal.c.in
new file mode 100644
index 0000000..ed65c1e
--- /dev/null
+++ b/src/qs8-gemm/c16-neon-mlal-padal.c.in
@@ -0,0 +1,256 @@
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+$assert NR % 8 == 0
+$assert 8 <= NR <= 16
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_${MR}x${NR}c16__neon_mlal_padal(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= ${MR});
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 16);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  $for M in range(1, MR):
+    const int8_t* a${M} = (const int8_t*) ((uintptr_t) a${M-1} + a_stride);
+    int8_t* c${M} = (int8_t*) ((uintptr_t) c${M-1} + cm_stride);
+    $if M % 2 == 0:
+      if XNN_UNPREDICTABLE(mr <= ${M}) {
+        a${M} = a${M-1};
+        c${M} = c${M-1};
+      }
+    $elif M + 1 == MR:
+      if XNN_UNPREDICTABLE(mr != ${M+1}) {
+        a${M} = a${M-1};
+        c${M} = c${M-1};
+      }
+    $else:
+      if XNN_UNPREDICTABLE(mr < ${M+1}) {
+        a${M} = a${M-1};
+        c${M} = c${M-1};
+      }
+
+  do {
+    $for N in range(NR):
+      int32x4_t vacc0x${N} = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    $for M in range(1, MR):
+      $for N in range(NR):
+        int32x4_t vacc${M}x${N} = vacc0x${N};
+
+    // KC loop of 16
+    size_t k = 0;
+    while (k < kc) {
+      $for M in range(MR):
+        const int8x16_t va${M} = vld1q_s8(a${M}); a${M} += 16;
+
+      $for N in range(NR):
+        const int8x16_t vb${N} = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+
+      $for N in range(NR):
+        $for M in range(MR):
+          int16x8_t vprod${M}x${N} = vmull_s8(vget_low_s8(vb${N}), vget_low_s8(va${M}));
+        $for M in range(MR):
+          vprod${M}x${N} = vmlal_s8(vprod${M}x${N}, vget_high_s8(vb${N}), vget_high_s8(va${M}));
+        $for M in range(MR):
+          vacc${M}x${N} = vpadalq_s16(vacc${M}x${N}, vprod${M}x${N});
+
+      k += 16 * sizeof(int8_t);
+    }
+
+#if XNN_ARCH_ARM64
+    $for M in range(MR):
+      $for N in range(0, NR, 4):
+        const int32x4_t vsum${M}x${ABC[N:N+2]} = vpaddq_s32(vacc${M}x${N}, vacc${M}x${N+1});
+        const int32x4_t vsum${M}x${ABC[N+2:N+4]} = vpaddq_s32(vacc${M}x${N+2}, vacc${M}x${N+3});
+    $for M in range(MR):
+      $for N in range(0, NR, 4):
+        int32x4_t vacc${M}x${ABC[N:N+4]} = vpaddq_s32(vsum${M}x${ABC[N:N+2]}, vsum${M}x${ABC[N+2:N+4]});
+#else
+    $for M in range(MR):
+      $for N in range(0, NR, 4):
+        const int32x2_t vpsum${M}x${ABC[N]} = vadd_s32(vget_low_s32(vacc${M}x${N}), vget_high_s32(vacc${M}x${N}));
+        const int32x2_t vpsum${M}x${ABC[N+1]} = vadd_s32(vget_low_s32(vacc${M}x${N+1}), vget_high_s32(vacc${M}x${N+1}));
+        const int32x2_t vpsum${M}x${ABC[N+2]} = vadd_s32(vget_low_s32(vacc${M}x${N+2}), vget_high_s32(vacc${M}x${N+2}));
+        const int32x2_t vpsum${M}x${ABC[N+3]} = vadd_s32(vget_low_s32(vacc${M}x${N+3}), vget_high_s32(vacc${M}x${N+3}));
+        const int32x2_t vsum${M}x${ABC[N:N+2]} = vpadd_s32(vpsum${M}x${ABC[N]}, vpsum${M}x${ABC[N+1]});
+        const int32x2_t vsum${M}x${ABC[N+2:N+4]} = vpadd_s32(vpsum${M}x${ABC[N+2]}, vpsum${M}x${ABC[N+3]});
+        int32x4_t vacc${M}x${ABC[N:N+4]} = vcombine_s32(vsum${M}x${ABC[N:N+2]}, vsum${M}x${ABC[N+2:N+4]} );
+#endif
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    $for M in range(MR):
+      $for N in range(0, NR, 4):
+        vacc${M}x${ABC[N:N+4]} = vqrdmulhq_s32(vacc${M}x${ABC[N:N+4]}, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    $for M in range(MR):
+      $for N in range(0, NR, 4):
+        vacc${M}x${ABC[N:N+4]} = vsraq_n_s32(vacc${M}x${ABC[N:N+4]}, vbicq_s32(vacc${M}x${ABC[N:N+4]}, vzero_shift_mask), 31);
+
+    $for M in range(MR):
+      $for N in range(0, NR, 4):
+        vacc${M}x${ABC[N:N+4]} = vrshlq_s32(vacc${M}x${ABC[N:N+4]}, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    $for M in range(MR):
+      $for N in range(0, NR, 8):
+        const int16x8_t vacc${M}x${ABC[N:N+8]} = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc${M}x${ABC[N:N+4]}), vacc${M}x${ABC[N+4:N+8]}), voutput_zero_point);
+    $for M in range(MR):
+      $for N in range(0, NR, 16):
+        $if N + 8 < NR:
+          int8x16_t vout${M}x${ABC[N:N+16]} = vqmovn_high_s16(vqmovn_s16(vacc${M}x${ABC[N:N+8]}), vacc${M}x${ABC[N+8:N+16]});
+        $elif M % 2 == 1:
+          int8x16_t vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vqmovn_high_s16(vqmovn_s16(vacc${M-1}x${ABC[N:N+8]}), vacc${M}x${ABC[N:N+8]});
+        $elif M + 1 == MR:
+          int8x8_t vout${M}x${ABC[N:N+8]} = vqmovn_s16(vacc${M}x${ABC[N:N+8]});
+#else
+    $for M in range(MR):
+      $for N in range(0, NR, 8):
+        const int16x8_t vacc${M}x${ABC[N:N+8]} = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc${M}x${ABC[N:N+4]}), vqmovn_s32(vacc${M}x${ABC[N+4:N+8]})), voutput_zero_point);
+
+    $for M in range(MR):
+      $for N in range(0, NR, 16):
+        $if N + 8 < NR:
+          int8x16_t vout${M}x${ABC[N:N+16]} = vcombine_s8(vqmovn_s16(vacc${M}x${ABC[N:N+8]}), vqmovn_s16(vacc${M}x${ABC[N+8:N+16]}));
+        $elif M % 2 == 1:
+          int8x16_t vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vcombine_s8(vqmovn_s16(vacc${M-1}x${ABC[N:N+8]}), vqmovn_s16(vacc${M}x${ABC[N:N+8]}));
+        $elif M + 1 == MR:
+          int8x8_t vout${M}x${ABC[N:N+8]} = vqmovn_s16(vacc${M}x${ABC[N:N+8]});
+#endif
+    $if NR == 8 and MR == 1:
+      const int8x8_t voutput_min = vld1_dup_s8(&params->neon.output_min);
+      const int8x8_t voutput_max = vld1_dup_s8(&params->neon.output_max);
+    $else:
+      const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+      const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    $for M in range(MR):
+      $for N in range(0, NR, 16):
+        $if N + 8 < NR:
+          vout${M}x${ABC[N:N+16]} = vmaxq_s8(vout${M}x${ABC[N:N+16]}, voutput_min);
+        $elif M % 2 == 1:
+          vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vmaxq_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}, voutput_min);
+        $elif M + 1 == MR:
+          $if NR == 8 and MR == 1:
+            vout${M}x${ABC[N:N+8]} = vmax_s8(vout${M}x${ABC[N:N+8]}, voutput_min);
+          $else:
+            vout${M}x${ABC[N:N+8]} = vmax_s8(vout${M}x${ABC[N:N+8]}, vget_low_s8(voutput_min));
+
+    $for M in range(MR):
+      $for N in range(0, NR, 16):
+        $if N + 8 < NR:
+          vout${M}x${ABC[N:N+16]} = vminq_s8(vout${M}x${ABC[N:N+16]}, voutput_max);
+        $elif M % 2 == 1:
+          vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vminq_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}, voutput_max);
+        $elif M + 1 == MR:
+          $if NR == 8 and MR == 1:
+            vout${M}x${ABC[N:N+8]} = vmin_s8(vout${M}x${ABC[N:N+8]}, voutput_max);
+          $else:
+            vout${M}x${ABC[N:N+8]} = vmin_s8(vout${M}x${ABC[N:N+8]}, vget_low_s8(voutput_max));
+
+    if (nc >= ${NR}) {
+      $for M in range(MR):
+        $for N in range(0, NR, 16):
+          $if N + 8 < NR:
+            vst1q_s8(c${M} + ${N}, vout${M}x${ABC[N:N+16]});
+          $elif M % 2 == 1:
+            vst1_s8(c${M-1} + ${N}, vget_low_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}));
+            vst1_s8(c${M} + ${N}, vget_high_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}));
+          $elif M + 1 == MR:
+            vst1_s8(c${M} + ${N}, vout${M}x${ABC[N:N+8]});
+
+      $for M in range(MR):
+        c${M} = (int8_t*) ((uintptr_t) c${M} + cn_stride);
+
+      $for M in range(MR):
+        a${M} = (const int8_t*) ((uintptr_t) a${M} - kc);
+
+      nc -= ${NR};
+    } else {
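+      // Partial tile: store 8, then 4, then 2, then 1 remaining columns, shifting the packed output vectors after each partial store so the next elements move to the front.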
+      $if NR == 16:
+        $for M in range(MR):
+          $if M % 2 == 1:
+            int8x16_t vout${M-1}x01234567_${M}x01234567 = vcombine_s8(vget_low_s8(vout${M-1}x0123456789ABCDEF), vget_low_s8(vout${M}x0123456789ABCDEF));
+          $elif M + 1 == MR:
+            int8x8_t vout${M}x01234567 = vget_low_s8(vout${M}x0123456789ABCDEF);
+        if (nc & 8) {
+          $for M in range(MR):
+            $if M % 2 == 1:
+              vst1_s8(c${M-1}, vget_low_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]})); c${M-1} += 8;
+              vst1_s8(c${M}, vget_high_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]})); c${M} += 8;
+            $elif M + 1 == MR:
+              vst1_s8(c${M}, vout${M}x${ABC[N:N+8]}); c${M} += 8;
+          $for M in range(MR):
+            $if M % 2 == 1:
+              vout${M-1}x01234567_${M}x01234567 = vcombine_s8(vget_high_s8(vout${M-1}x0123456789ABCDEF), vget_high_s8(vout${M}x0123456789ABCDEF));
+            $elif M + 1 == MR:
+              vout${M}x01234567 = vget_high_s8(vout${M}x0123456789ABCDEF);
+        }
+      if (nc & 4) {
+        $for M in range(MR):
+          $if M % 2 == 1:
+            vst1q_lane_u32(__builtin_assume_aligned(c${M-1}, 1), vreinterpretq_u32_s8(vout${M-1}x01234567_${M}x01234567), 0); c${M-1} += 4;
+            vst1q_lane_u32(__builtin_assume_aligned(c${M}, 1), vreinterpretq_u32_s8(vout${M-1}x01234567_${M}x01234567), 2); c${M} += 4;
+          $elif M + 1 == MR:
+            vst1_lane_u32(__builtin_assume_aligned(c${M}, 1), vreinterpret_u32_s8(vout${M}x01234567), 0); c${M} += 4;
+        $for M in range(MR):
+          $if M % 2 == 1:
+            vout${M-1}x01234567_${M}x01234567 = vextq_s8(vout${M-1}x01234567_${M}x01234567, vout${M-1}x01234567_${M}x01234567, 4);
+          $elif M + 1 == MR:
+            vout${M}x01234567 = vext_s8(vout${M}x01234567, vout${M}x01234567, 4);
+      }
+      if (nc & 2) {
+        $for M in range(MR):
+          $if M % 2 == 1:
+            vst1q_lane_u16(__builtin_assume_aligned(c${M-1}, 1), vreinterpretq_u16_s8(vout${M-1}x01234567_${M}x01234567), 0); c${M-1} += 2;
+            vst1q_lane_u16(__builtin_assume_aligned(c${M}, 1), vreinterpretq_u16_s8(vout${M-1}x01234567_${M}x01234567), 4); c${M} += 2;
+          $elif M + 1 == MR:
+            vst1_lane_u16(__builtin_assume_aligned(c${M}, 1), vreinterpret_u16_s8(vout${M}x01234567), 0); c${M} += 2;
+        $for M in range(MR):
+          $if M % 2 == 1:
+            vout${M-1}x01234567_${M}x01234567 = vextq_s8(vout${M-1}x01234567_${M}x01234567, vout${M-1}x01234567_${M}x01234567, 2);
+          $elif M + 1 == MR:
+            vout${M}x01234567 = vext_s8(vout${M}x01234567, vout${M}x01234567, 2);
+      }
+      if (nc & 1) {
+        $for M in range(MR):
+          $if M % 2 == 1:
+            vst1q_lane_s8(c${M-1}, vout${M-1}x01234567_${M}x01234567, 0);
+            vst1q_lane_s8(c${M}, vout${M-1}x01234567_${M}x01234567, 8);
+          $elif M + 1 == MR:
+            vst1_lane_s8(c${M}, vout${M}x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-gemm/c2-neon-mull-padal-dup.c.in b/src/qs8-gemm/c2-neon-mull-padal-dup.c.in
new file mode 100644
index 0000000..cbc793c
--- /dev/null
+++ b/src/qs8-gemm/c2-neon-mull-padal-dup.c.in
@@ -0,0 +1,292 @@
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+$assert NR % 8 == 0
+$assert 8 <= NR <= 16
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_${MR}x${NR}c2__neon_${"mlal" if MLA else "mull"}_padal_dup(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= ${MR});
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  $for M in range(1, MR):
+    const int8_t* a${M} = (const int8_t*) ((uintptr_t) a${M-1} + a_stride);
+    int8_t* c${M} = (int8_t*) ((uintptr_t) c${M-1} + cm_stride);
+    $if M % 2 == 0:
+      if XNN_UNPREDICTABLE(mr <= ${M}) {
+        a${M} = a${M-1};
+        c${M} = c${M-1};
+      }
+    $elif M + 1 == MR:
+      if XNN_UNPREDICTABLE(mr != ${M+1}) {
+        a${M} = a${M-1};
+        c${M} = c${M-1};
+      }
+    $else:
+      if XNN_UNPREDICTABLE(mr < ${M+1}) {
+        a${M} = a${M-1};
+        c${M} = c${M-1};
+      }
+
+  do {
+    $for N in range(0, NR, 4):
+      int32x4_t vacc0x${ABC[N:N+4]} = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    $for M in range(1, MR):
+      $for N in range(0, NR, 4):
+        int32x4_t vacc${M}x${ABC[N:N+4]} = vacc0x${ABC[N:N+4]};
+
+    size_t k = kc;
+
+    $if MLA:
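+      // 2x partially unrolled main loop: load 16 bytes of A per row and combine MULL with MLAL into int16 products before widening into the int32 accumulators with VPADAL.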
+      while (k >= 16 * sizeof(int8_t)) {
+        $for M in range(MR):
+          const int8x8_t va${M}x0 = vld1_s8(a${M}); a${M} += 8;
+          const int8x8_t va${M}x1 = vld1_s8(a${M}); a${M} += 8;
+
+        $for K in range(4):
+          $for N in range(0, NR, 4):
+            const int8x8_t vb${ABC[N:N+4]}c${K}x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        $for K in range(4):
+          $for N in range(0, NR, 4):
+            $for M in range(MR):
+              int16x8_t vprod${M}x${ABC[N:N+4]}c${K} = vmull_s8(vb${ABC[N:N+4]}c${K}x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va${M}x0), ${K})));
+            const int8x8_t vb${ABC[N:N+4]}c${K}x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            $for M in range(MR):
+              vprod${M}x${ABC[N:N+4]}c${K} = vmlal_s8(vprod${M}x${ABC[N:N+4]}c${K}, vb${ABC[N:N+4]}c${K}x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va${M}x1), ${K})));
+            $for M in range(MR):
+              vacc${M}x${ABC[N:N+4]} = vpadalq_s16(vacc${M}x${ABC[N:N+4]}, vprod${M}x${ABC[N:N+4]}c${K});
+
+        k -= 16 * sizeof(int8_t);
+      }
+
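+    // Handle 8 bytes of A per row at a time using MULL only.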
+    ${"if" if MLA else "while"} (k >= 8 * sizeof(int8_t)) {
+      $for M in range(MR):
+        const int8x8_t va${M} = vld1_s8(a${M}); a${M} += 8;
+
+      $for K in range(4):
+        $for N in range(0, NR, 4):
+          const int8x8_t vb${ABC[N:N+4]}c${K} = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      $for M in range(MR):
+        $for N in range(0, NR, 4):
+          $for K in range(4):
+            const int16x8_t vprod${M}x${ABC[N:N+4]}c${K} = vmull_s8(vb${ABC[N:N+4]}c${K}, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va${M}), ${K})));
+          $for K in range(4):
+            vacc${M}x${ABC[N:N+4]} = vpadalq_s16(vacc${M}x${ABC[N:N+4]}, vprod${M}x${ABC[N:N+4]}c${K});
+
+      k -= 8 * sizeof(int8_t);
+    }
+
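+    // Remainder of 2, 4, or 6 bytes of K (kc is rounded up to a multiple of 2).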
+    if XNN_UNLIKELY(k != 0) {
+      $for M in range(MR):
+        const int8x8_t va${M} = vld1_s8(a${M}); a${M} = (const int8_t*) ((uintptr_t) a${M} + k);
+
+      $for N in range(0, NR, 4):
+        const int8x8_t vb${ABC[N:N+4]}c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      $for M in range(MR):
+        $for N in range(0, NR, 4):
+          const int16x8_t vprod${M}x${ABC[N:N+4]}c0 = vmull_s8(vb${ABC[N:N+4]}c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va${M}), 0)));
+          vacc${M}x${ABC[N:N+4]} = vpadalq_s16(vacc${M}x${ABC[N:N+4]}, vprod${M}x${ABC[N:N+4]}c0);
+
+      if (k > 2 * sizeof(int8_t)) {
+        $for N in range(0, NR, 4):
+          const int8x8_t vb${ABC[N:N+4]}c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        $for M in range(MR):
+          $for N in range(0, NR, 4):
+            const int16x8_t vprod${M}x${ABC[N:N+4]}c1 = vmull_s8(vb${ABC[N:N+4]}c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va${M}), 1)));
+            vacc${M}x${ABC[N:N+4]} = vpadalq_s16(vacc${M}x${ABC[N:N+4]}, vprod${M}x${ABC[N:N+4]}c1);
+
+        if (k > 4 * sizeof(int8_t)) {
+          $for N in range(0, NR, 4):
+            const int8x8_t vb${ABC[N:N+4]}c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          $for M in range(MR):
+            $for N in range(0, NR, 4):
+              const int16x8_t vprod${M}x${ABC[N:N+4]}c2 = vmull_s8(vb${ABC[N:N+4]}c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va${M}), 2)));
+              vacc${M}x${ABC[N:N+4]} = vpadalq_s16(vacc${M}x${ABC[N:N+4]}, vprod${M}x${ABC[N:N+4]}c2);
+        }
+      }
+    }
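+    // Requantization: saturating Q31 multiply by the output multiplier, followed by a rounding arithmetic right shift.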
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    $for M in range(MR):
+      $for N in range(0, NR, 4):
+        vacc${M}x${ABC[N:N+4]} = vqrdmulhq_s32(vacc${M}x${ABC[N:N+4]}, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    $for M in range(MR):
+      $for N in range(0, NR, 4):
+        vacc${M}x${ABC[N:N+4]} = vsraq_n_s32(vacc${M}x${ABC[N:N+4]}, vbicq_s32(vacc${M}x${ABC[N:N+4]}, vzero_shift_mask), 31);
+
+    $for M in range(MR):
+      $for N in range(0, NR, 4):
+        vacc${M}x${ABC[N:N+4]} = vrshlq_s32(vacc${M}x${ABC[N:N+4]}, vright_shift);
+
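+    // Narrow the requantized accumulators to int16, add the output zero point, saturate to int8, then clamp to [output_min, output_max].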
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    $for M in range(MR):
+      $for N in range(0, NR, 8):
+        const int16x8_t vacc${M}x${ABC[N:N+8]} = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc${M}x${ABC[N:N+4]}), vacc${M}x${ABC[N+4:N+8]}), voutput_zero_point);
+
+    $for M in range(MR):
+      $for N in range(0, NR, 16):
+        $if N + 8 < NR:
+          int8x16_t vout${M}x${ABC[N:N+16]} = vqmovn_high_s16(vqmovn_s16(vacc${M}x${ABC[N:N+8]}), vacc${M}x${ABC[N+8:N+16]});
+        $elif M % 2 == 1:
+          int8x16_t vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vqmovn_high_s16(vqmovn_s16(vacc${M-1}x${ABC[N:N+8]}), vacc${M}x${ABC[N:N+8]});
+        $elif M + 1 == MR:
+          int8x8_t vout${M}x${ABC[N:N+8]} = vqmovn_s16(vacc${M}x${ABC[N:N+8]});
+#else
+    $for M in range(MR):
+      $for N in range(0, NR, 8):
+        const int16x8_t vacc${M}x${ABC[N:N+8]} = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc${M}x${ABC[N:N+4]}), vqmovn_s32(vacc${M}x${ABC[N+4:N+8]})), voutput_zero_point);
+
+    $for M in range(MR):
+      $for N in range(0, NR, 16):
+        $if N + 8 < NR:
+          int8x16_t vout${M}x${ABC[N:N+16]} = vcombine_s8(vqmovn_s16(vacc${M}x${ABC[N:N+8]}), vqmovn_s16(vacc${M}x${ABC[N+8:N+16]}));
+        $elif M % 2 == 1:
+          int8x16_t vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vcombine_s8(vqmovn_s16(vacc${M-1}x${ABC[N:N+8]}), vqmovn_s16(vacc${M}x${ABC[N:N+8]}));
+        $elif M + 1 == MR:
+          int8x8_t vout${M}x${ABC[N:N+8]} = vqmovn_s16(vacc${M}x${ABC[N:N+8]});
+#endif
+    $if NR == 8 and MR == 1:
+      const int8x8_t voutput_min = vld1_dup_s8(&params->neon.output_min);
+      const int8x8_t voutput_max = vld1_dup_s8(&params->neon.output_max);
+    $else:
+      const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+      const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    $for M in range(MR):
+      $for N in range(0, NR, 16):
+        $if N + 8 < NR:
+          vout${M}x${ABC[N:N+16]} = vmaxq_s8(vout${M}x${ABC[N:N+16]}, voutput_min);
+        $elif M % 2 == 1:
+          vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vmaxq_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}, voutput_min);
+        $elif M + 1 == MR:
+          $if NR == 8 and MR == 1:
+            vout${M}x${ABC[N:N+8]} = vmax_s8(vout${M}x${ABC[N:N+8]}, voutput_min);
+          $else:
+            vout${M}x${ABC[N:N+8]} = vmax_s8(vout${M}x${ABC[N:N+8]}, vget_low_s8(voutput_min));
+
+    $for M in range(MR):
+      $for N in range(0, NR, 16):
+        $if N + 8 < NR:
+          vout${M}x${ABC[N:N+16]} = vminq_s8(vout${M}x${ABC[N:N+16]}, voutput_max);
+        $elif M % 2 == 1:
+          vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vminq_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}, voutput_max);
+        $elif M + 1 == MR:
+          $if NR == 8 and MR == 1:
+            vout${M}x${ABC[N:N+8]} = vmin_s8(vout${M}x${ABC[N:N+8]}, voutput_max);
+          $else:
+            vout${M}x${ABC[N:N+8]} = vmin_s8(vout${M}x${ABC[N:N+8]}, vget_low_s8(voutput_max));
+
+    if (nc >= ${NR}) {
+      $for M in range(MR):
+        $for N in range(0, NR, 16):
+          $if N + 8 < NR:
+            vst1q_s8(c${M} + ${N}, vout${M}x${ABC[N:N+16]});
+          $elif M % 2 == 1:
+            vst1_s8(c${M-1} + ${N}, vget_low_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}));
+            vst1_s8(c${M} + ${N}, vget_high_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}));
+          $elif M + 1 == MR:
+            vst1_s8(c${M} + ${N}, vout${M}x${ABC[N:N+8]});
+
+      $for M in range(MR):
+        c${M} = (int8_t*) ((uintptr_t) c${M} + cn_stride);
+
+      $for M in range(MR):
+        a${M} = (const int8_t*) ((uintptr_t) a${M} - kc);
+
+      nc -= ${NR};
+    } else {
+      $if NR == 16:
+        $for M in range(MR):
+          $if M % 2 == 1:
+            int8x16_t vout${M-1}x01234567_${M}x01234567 = vcombine_s8(vget_low_s8(vout${M-1}x0123456789ABCDEF), vget_low_s8(vout${M}x0123456789ABCDEF));
+          $elif M + 1 == MR:
+            int8x8_t vout${M}x01234567 = vget_low_s8(vout${M}x0123456789ABCDEF);
+        if (nc & 8) {
+          $for M in range(MR):
+            $if M % 2 == 1:
+              vst1_s8(c${M-1}, vget_low_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]})); c${M-1} += 8;
+              vst1_s8(c${M}, vget_high_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]})); c${M} += 8;
+            $elif M + 1 == MR:
+              vst1_s8(c${M}, vout${M}x${ABC[N:N+8]}); c${M} += 8;
+          $for M in range(MR):
+            $if M % 2 == 1:
+              vout${M-1}x01234567_${M}x01234567 = vcombine_s8(vget_high_s8(vout${M-1}x0123456789ABCDEF), vget_high_s8(vout${M}x0123456789ABCDEF));
+            $elif M + 1 == MR:
+              vout${M}x01234567 = vget_high_s8(vout${M}x0123456789ABCDEF);
+        }
+      if (nc & 4) {
+        $for M in range(MR):
+          $if M % 2 == 1:
+            vst1q_lane_u32(__builtin_assume_aligned(c${M-1}, 1), vreinterpretq_u32_s8(vout${M-1}x01234567_${M}x01234567), 0); c${M-1} += 4;
+            vst1q_lane_u32(__builtin_assume_aligned(c${M}, 1), vreinterpretq_u32_s8(vout${M-1}x01234567_${M}x01234567), 2); c${M} += 4;
+          $elif M + 1 == MR:
+            vst1_lane_u32(__builtin_assume_aligned(c${M}, 1), vreinterpret_u32_s8(vout${M}x01234567), 0); c${M} += 4;
+        $for M in range(MR):
+          $if M % 2 == 1:
+            vout${M-1}x01234567_${M}x01234567 = vextq_s8(vout${M-1}x01234567_${M}x01234567, vout${M-1}x01234567_${M}x01234567, 4);
+          $elif M + 1 == MR:
+            vout${M}x01234567 = vext_s8(vout${M}x01234567, vout${M}x01234567, 4);
+      }
+      if (nc & 2) {
+        $for M in range(MR):
+          $if M % 2 == 1:
+            vst1q_lane_u16(__builtin_assume_aligned(c${M-1}, 1), vreinterpretq_u16_s8(vout${M-1}x01234567_${M}x01234567), 0); c${M-1} += 2;
+            vst1q_lane_u16(__builtin_assume_aligned(c${M}, 1), vreinterpretq_u16_s8(vout${M-1}x01234567_${M}x01234567), 4); c${M} += 2;
+          $elif M + 1 == MR:
+            vst1_lane_u16(__builtin_assume_aligned(c${M}, 1), vreinterpret_u16_s8(vout${M}x01234567), 0); c${M} += 2;
+        $for M in range(MR):
+          $if M % 2 == 1:
+            vout${M-1}x01234567_${M}x01234567 = vextq_s8(vout${M-1}x01234567_${M}x01234567, vout${M-1}x01234567_${M}x01234567, 2);
+          $elif M + 1 == MR:
+            vout${M}x01234567 = vext_s8(vout${M}x01234567, vout${M}x01234567, 2);
+      }
+      if (nc & 1) {
+        $for M in range(MR):
+          $if M % 2 == 1:
+            vst1q_lane_s8(c${M-1}, vout${M-1}x01234567_${M}x01234567, 0);
+            vst1q_lane_s8(c${M}, vout${M-1}x01234567_${M}x01234567, 8);
+          $elif M + 1 == MR:
+            vst1_lane_s8(c${M}, vout${M}x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-gemm/c8-neon-mull-padal.c.in b/src/qs8-gemm/c8-neon-mull-padal.c.in
new file mode 100644
index 0000000..6ba05b5
--- /dev/null
+++ b/src/qs8-gemm/c8-neon-mull-padal.c.in
@@ -0,0 +1,274 @@
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+$assert NR % 8 == 0
+$assert 8 <= NR <= 16
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_${MR}x${NR}c8__neon_${"mlal" if MLA else "mull"}_padal(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= ${MR});
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  $for M in range(1, MR):
+    const int8_t* a${M} = (const int8_t*) ((uintptr_t) a${M-1} + a_stride);
+    int8_t* c${M} = (int8_t*) ((uintptr_t) c${M-1} + cm_stride);
+    $if M % 2 == 0:
+      if XNN_UNPREDICTABLE(mr <= ${M}) {
+        a${M} = a${M-1};
+        c${M} = c${M-1};
+      }
+    $elif M + 1 == MR:
+      if XNN_UNPREDICTABLE(mr != ${M+1}) {
+        a${M} = a${M-1};
+        c${M} = c${M-1};
+      }
+    $else:
+      if XNN_UNPREDICTABLE(mr < ${M+1}) {
+        a${M} = a${M-1};
+        c${M} = c${M-1};
+      }
+
+  do {
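+    // Initialize the accumulators with the NR bias values stored at the start of each column group in w; only lane 0 of each vector carries the bias, the remaining lanes start at zero.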
+    $for N in range(NR):
+      int32x4_t vacc0x${N} = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    $for M in range(1, MR):
+      $for N in range(NR):
+        int32x4_t vacc${M}x${N} = vacc0x${N};
+
+    size_t k = kc;
+    $if MLA:
+      // 2x partial unrolled loop to load 16 bytes at a time using MLA.
+      while (k >= 16 * sizeof(int8_t)) {
+        $for M in range(MR):
+          const int8x8_t va${M}x0 = vld1_s8(a${M}); a${M} += 8;
+          const int8x8_t va${M}x1 = vld1_s8(a${M}); a${M} += 8;
+
+        $for N in range(NR):
+          const int8x8_t vb${N}x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        $for N in range(NR):
+          const int8x8_t vb${N}x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          $for M in range(MR):
+            int16x8_t vprod${M}x${N} = vmull_s8(vb${N}x0, va${M}x0);
+          $for M in range(MR):
+            vprod${M}x${N} = vmlal_s8(vprod${M}x${N}, vb${N}x1, va${M}x1);
+          $for M in range(MR):
+            vacc${M}x${N} = vpadalq_s16(vacc${M}x${N}, vprod${M}x${N});
+
+        k -= 16 * sizeof(int8_t);
+      }
+
+    // Handle 8 bytes at a time using MUL.
+    ${"if" if MLA else "while"} (k > 0) {
+      $for M in range(MR):
+        const int8x8_t va${M} = vld1_s8(a${M}); a${M} += 8;
+
+      $for N in range(NR):
+        const int8x8_t vb${N} = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        $for M in range(MR):
+          const int16x8_t vprod${M}x${N} = vmull_s8(vb${N}, va${M});
+        $for M in range(MR):
+          vacc${M}x${N} = vpadalq_s16(vacc${M}x${N}, vprod${M}x${N});
+
+      k -= 8 * sizeof(int8_t);
+    }
+
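+    // Each per-column int32x4 accumulator holds 4 partial sums; reduce each group of 4 adjacent columns into a single int32x4 vector.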
+#if XNN_ARCH_ARM64
+    $for M in range(MR):
+      $for N in range(0, NR, 4):
+        const int32x4_t vsum${M}x${ABC[N:N+2]} = vpaddq_s32(vacc${M}x${N}, vacc${M}x${N+1});
+        const int32x4_t vsum${M}x${ABC[N+2:N+4]} = vpaddq_s32(vacc${M}x${N+2}, vacc${M}x${N+3});
+    $for M in range(MR):
+      $for N in range(0, NR, 4):
+        int32x4_t vacc${M}x${ABC[N:N+4]} = vpaddq_s32(vsum${M}x${ABC[N:N+2]}, vsum${M}x${ABC[N+2:N+4]});
+#else
+    $for M in range(MR):
+      $for N in range(0, NR, 4):
+        const int32x2_t vpsum${M}x${ABC[N]} = vadd_s32(vget_low_s32(vacc${M}x${N}), vget_high_s32(vacc${M}x${N}));
+        const int32x2_t vpsum${M}x${ABC[N+1]} = vadd_s32(vget_low_s32(vacc${M}x${N+1}), vget_high_s32(vacc${M}x${N+1}));
+        const int32x2_t vpsum${M}x${ABC[N+2]} = vadd_s32(vget_low_s32(vacc${M}x${N+2}), vget_high_s32(vacc${M}x${N+2}));
+        const int32x2_t vpsum${M}x${ABC[N+3]} = vadd_s32(vget_low_s32(vacc${M}x${N+3}), vget_high_s32(vacc${M}x${N+3}));
+        const int32x2_t vsum${M}x${ABC[N:N+2]} = vpadd_s32(vpsum${M}x${ABC[N]}, vpsum${M}x${ABC[N+1]});
+        const int32x2_t vsum${M}x${ABC[N+2:N+4]} = vpadd_s32(vpsum${M}x${ABC[N+2]}, vpsum${M}x${ABC[N+3]});
+        int32x4_t vacc${M}x${ABC[N:N+4]} = vcombine_s32(vsum${M}x${ABC[N:N+2]}, vsum${M}x${ABC[N+2:N+4]});
+#endif
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    $for M in range(MR):
+      $for N in range(0, NR, 4):
+        vacc${M}x${ABC[N:N+4]} = vqrdmulhq_s32(vacc${M}x${ABC[N:N+4]}, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    $for M in range(MR):
+      $for N in range(0, NR, 4):
+        vacc${M}x${ABC[N:N+4]} = vsraq_n_s32(vacc${M}x${ABC[N:N+4]}, vbicq_s32(vacc${M}x${ABC[N:N+4]}, vzero_shift_mask), 31);
+
+    $for M in range(MR):
+      $for N in range(0, NR, 4):
+        vacc${M}x${ABC[N:N+4]} = vrshlq_s32(vacc${M}x${ABC[N:N+4]}, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    $for M in range(MR):
+      $for N in range(0, NR, 8):
+        const int16x8_t vacc${M}x${ABC[N:N+8]} = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc${M}x${ABC[N:N+4]}), vacc${M}x${ABC[N+4:N+8]}), voutput_zero_point);
+    $for M in range(MR):
+      $for N in range(0, NR, 16):
+        $if N + 8 < NR:
+          int8x16_t vout${M}x${ABC[N:N+16]} = vqmovn_high_s16(vqmovn_s16(vacc${M}x${ABC[N:N+8]}), vacc${M}x${ABC[N+8:N+16]});
+        $elif M % 2 == 1:
+          int8x16_t vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vqmovn_high_s16(vqmovn_s16(vacc${M-1}x${ABC[N:N+8]}), vacc${M}x${ABC[N:N+8]});
+        $elif M + 1 == MR:
+          int8x8_t vout${M}x${ABC[N:N+8]} = vqmovn_s16(vacc${M}x${ABC[N:N+8]});
+#else
+    $for M in range(MR):
+      $for N in range(0, NR, 8):
+        const int16x8_t vacc${M}x${ABC[N:N+8]} = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc${M}x${ABC[N:N+4]}), vqmovn_s32(vacc${M}x${ABC[N+4:N+8]})), voutput_zero_point);
+
+    $for M in range(MR):
+      $for N in range(0, NR, 16):
+        $if N + 8 < NR:
+          int8x16_t vout${M}x${ABC[N:N+16]} = vcombine_s8(vqmovn_s16(vacc${M}x${ABC[N:N+8]}), vqmovn_s16(vacc${M}x${ABC[N+8:N+16]}));
+        $elif M % 2 == 1:
+          int8x16_t vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vcombine_s8(vqmovn_s16(vacc${M-1}x${ABC[N:N+8]}), vqmovn_s16(vacc${M}x${ABC[N:N+8]}));
+        $elif M + 1 == MR:
+          int8x8_t vout${M}x${ABC[N:N+8]} = vqmovn_s16(vacc${M}x${ABC[N:N+8]});
+#endif
+    $if NR == 8 and MR == 1:
+      const int8x8_t voutput_min = vld1_dup_s8(&params->neon.output_min);
+      const int8x8_t voutput_max = vld1_dup_s8(&params->neon.output_max);
+    $else:
+      const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+      const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    $for M in range(MR):
+      $for N in range(0, NR, 16):
+        $if N + 8 < NR:
+          vout${M}x${ABC[N:N+16]} = vmaxq_s8(vout${M}x${ABC[N:N+16]}, voutput_min);
+        $elif M % 2 == 1:
+          vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vmaxq_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}, voutput_min);
+        $elif M + 1 == MR:
+          $if NR == 8 and MR == 1:
+            vout${M}x${ABC[N:N+8]} = vmax_s8(vout${M}x${ABC[N:N+8]}, voutput_min);
+          $else:
+            vout${M}x${ABC[N:N+8]} = vmax_s8(vout${M}x${ABC[N:N+8]}, vget_low_s8(voutput_min));
+
+    $for M in range(MR):
+      $for N in range(0, NR, 16):
+        $if N + 8 < NR:
+          vout${M}x${ABC[N:N+16]} = vminq_s8(vout${M}x${ABC[N:N+16]}, voutput_max);
+        $elif M % 2 == 1:
+          vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vminq_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}, voutput_max);
+        $elif M + 1 == MR:
+          $if NR == 8 and MR == 1:
+            vout${M}x${ABC[N:N+8]} = vmin_s8(vout${M}x${ABC[N:N+8]}, voutput_max);
+          $else:
+            vout${M}x${ABC[N:N+8]} = vmin_s8(vout${M}x${ABC[N:N+8]}, vget_low_s8(voutput_max));
+
+    if (nc >= ${NR}) {
+      $for M in range(MR):
+        $for N in range(0, NR, 16):
+          $if N + 8 < NR:
+            vst1q_s8(c${M} + ${N}, vout${M}x${ABC[N:N+16]});
+          $elif M % 2 == 1:
+            vst1_s8(c${M-1} + ${N}, vget_low_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}));
+            vst1_s8(c${M} + ${N}, vget_high_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}));
+          $elif M + 1 == MR:
+            vst1_s8(c${M} + ${N}, vout${M}x${ABC[N:N+8]});
+
+      $for M in range(MR):
+        c${M} = (int8_t*) ((uintptr_t) c${M} + cn_stride);
+
+      $for M in range(MR):
+        a${M} = (const int8_t*) ((uintptr_t) a${M} - kc);
+
+      nc -= ${NR};
+    } else {
+      $if NR == 16:
+        $for M in range(MR):
+          $if M % 2 == 1:
+            int8x16_t vout${M-1}x01234567_${M}x01234567 = vcombine_s8(vget_low_s8(vout${M-1}x0123456789ABCDEF), vget_low_s8(vout${M}x0123456789ABCDEF));
+          $elif M + 1 == MR:
+            int8x8_t vout${M}x01234567 = vget_low_s8(vout${M}x0123456789ABCDEF);
+        if (nc & 8) {
+          $for M in range(MR):
+            $if M % 2 == 1:
+              vst1_s8(c${M-1}, vget_low_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]})); c${M-1} += 8;
+              vst1_s8(c${M}, vget_high_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]})); c${M} += 8;
+            $elif M + 1 == MR:
+              vst1_s8(c${M}, vout${M}x${ABC[N:N+8]}); c${M} += 8;
+          $for M in range(MR):
+            $if M % 2 == 1:
+              vout${M-1}x01234567_${M}x01234567 = vcombine_s8(vget_high_s8(vout${M-1}x0123456789ABCDEF), vget_high_s8(vout${M}x0123456789ABCDEF));
+            $elif M + 1 == MR:
+              vout${M}x01234567 = vget_high_s8(vout${M}x0123456789ABCDEF);
+        }
+      if (nc & 4) {
+        $for M in range(MR):
+          $if M % 2 == 1:
+            vst1q_lane_u32(__builtin_assume_aligned(c${M-1}, 1), vreinterpretq_u32_s8(vout${M-1}x01234567_${M}x01234567), 0); c${M-1} += 4;
+            vst1q_lane_u32(__builtin_assume_aligned(c${M}, 1), vreinterpretq_u32_s8(vout${M-1}x01234567_${M}x01234567), 2); c${M} += 4;
+          $elif M + 1 == MR:
+            vst1_lane_u32(__builtin_assume_aligned(c${M}, 1), vreinterpret_u32_s8(vout${M}x01234567), 0); c${M} += 4;
+        $for M in range(MR):
+          $if M % 2 == 1:
+            vout${M-1}x01234567_${M}x01234567 = vextq_s8(vout${M-1}x01234567_${M}x01234567, vout${M-1}x01234567_${M}x01234567, 4);
+          $elif M + 1 == MR:
+            vout${M}x01234567 = vext_s8(vout${M}x01234567, vout${M}x01234567, 4);
+      }
+      if (nc & 2) {
+        $for M in range(MR):
+          $if M % 2 == 1:
+            vst1q_lane_u16(__builtin_assume_aligned(c${M-1}, 1), vreinterpretq_u16_s8(vout${M-1}x01234567_${M}x01234567), 0); c${M-1} += 2;
+            vst1q_lane_u16(__builtin_assume_aligned(c${M}, 1), vreinterpretq_u16_s8(vout${M-1}x01234567_${M}x01234567), 4); c${M} += 2;
+          $elif M + 1 == MR:
+            vst1_lane_u16(__builtin_assume_aligned(c${M}, 1), vreinterpret_u16_s8(vout${M}x01234567), 0); c${M} += 2;
+        $for M in range(MR):
+          $if M % 2 == 1:
+            vout${M-1}x01234567_${M}x01234567 = vextq_s8(vout${M-1}x01234567_${M}x01234567, vout${M-1}x01234567_${M}x01234567, 2);
+          $elif M + 1 == MR:
+            vout${M}x01234567 = vext_s8(vout${M}x01234567, vout${M}x01234567, 2);
+      }
+      if (nc & 1) {
+        $for M in range(MR):
+          $if M % 2 == 1:
+            vst1q_lane_s8(c${M-1}, vout${M-1}x01234567_${M}x01234567, 0);
+            vst1q_lane_s8(c${M}, vout${M-1}x01234567_${M}x01234567, 8);
+          $elif M + 1 == MR:
+            vst1_lane_s8(c${M}, vout${M}x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-gemm/gen/12x4c4-minmax-scalar.c b/src/qs8-gemm/gen/12x4c4-minmax-scalar.c
deleted file mode 100644
index db87ecb..0000000
--- a/src/qs8-gemm/gen/12x4c4-minmax-scalar.c
+++ /dev/null
@@ -1,945 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/qs8-gemm/MRxNRc4-scalar.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <xnnpack/gemm.h>
-
-#include <xnnpack/scalar-utils.h>
-
-// This kernel is a scalar model for a kernel using ARMv8.2 dot-product
-// instructions.
-//
-// XNN_DISABLE_TSAN is used because this kernel reads up to 3 bytes past the
-// bounds of the `a` matrix region, which may be a race condition with
-// another thread. We deem this acceptable because the values that are
-// read out of bounds do not affect the result, and the compiler can't know
-// about this undefined behavior.
-void xnn_qs8_gemm_minmax_ukernel_12x4c4__scalar(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    const int8_t* restrict a,
-    size_t a_stride,
-    const void* restrict w,
-    int8_t* restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN {
-  assert(mr != 0);
-  assert(mr <= 12);
-  assert(nc != 0);
-  assert(kc != 0);
-
-  const int8_t* a0 = a;
-  int8_t* c0 = c;
-  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
-  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 2) {
-    a1 = a0;
-    c1 = c0;
-  }
-  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
-  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 2) {
-    a2 = a1;
-    c2 = c1;
-  }
-  const int8_t* a3 = (const int8_t*) ((uintptr_t) a2 + a_stride);
-  int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 4) {
-    a3 = a2;
-    c3 = c2;
-  }
-  const int8_t* a4 = (const int8_t*) ((uintptr_t) a3 + a_stride);
-  int8_t* c4 = (int8_t*) ((uintptr_t) c3 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 4) {
-    a4 = a3;
-    c4 = c3;
-  }
-  const int8_t* a5 = (const int8_t*) ((uintptr_t) a4 + a_stride);
-  int8_t* c5 = (int8_t*) ((uintptr_t) c4 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 6) {
-    a5 = a4;
-    c5 = c4;
-  }
-  const int8_t* a6 = (const int8_t*) ((uintptr_t) a5 + a_stride);
-  int8_t* c6 = (int8_t*) ((uintptr_t) c5 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 6) {
-    a6 = a5;
-    c6 = c5;
-  }
-  const int8_t* a7 = (const int8_t*) ((uintptr_t) a6 + a_stride);
-  int8_t* c7 = (int8_t*) ((uintptr_t) c6 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 8) {
-    a7 = a6;
-    c7 = c6;
-  }
-  const int8_t* a8 = (const int8_t*) ((uintptr_t) a7 + a_stride);
-  int8_t* c8 = (int8_t*) ((uintptr_t) c7 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 8) {
-    a8 = a7;
-    c8 = c7;
-  }
-  const int8_t* a9 = (const int8_t*) ((uintptr_t) a8 + a_stride);
-  int8_t* c9 = (int8_t*) ((uintptr_t) c8 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 10) {
-    a9 = a8;
-    c9 = c8;
-  }
-  const int8_t* a10 = (const int8_t*) ((uintptr_t) a9 + a_stride);
-  int8_t* c10 = (int8_t*) ((uintptr_t) c9 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 10) {
-    a10 = a9;
-    c10 = c9;
-  }
-  const int8_t* a11 = (const int8_t*) ((uintptr_t) a10 + a_stride);
-  int8_t* c11 = (int8_t*) ((uintptr_t) c10 + cm_stride);
-  if XNN_UNPREDICTABLE(mr != 12) {
-    a11 = a10;
-    c11 = c10;
-  }
-
-  // Loop over groups of 4 columns.
-  do {
-    // `vaccMN` is the accumulator at row `M`, column `N`.
-    // Initialize accumulators with bias. 4 bias values are loaded from the
-    // weight matrix, at the start of the group of 4 columns.
-    int32_t bias0 = ((const int32_t*)w)[0];
-    int32_t vacc00 = bias0;
-    int32_t vacc10 = bias0;
-    int32_t vacc20 = bias0;
-    int32_t vacc30 = bias0;
-    int32_t vacc40 = bias0;
-    int32_t vacc50 = bias0;
-    int32_t vacc60 = bias0;
-    int32_t vacc70 = bias0;
-    int32_t vacc80 = bias0;
-    int32_t vacc90 = bias0;
-    int32_t vacc100 = bias0;
-    int32_t vacc110 = bias0;
-    int32_t bias1 = ((const int32_t*)w)[1];
-    int32_t vacc01 = bias1;
-    int32_t vacc11 = bias1;
-    int32_t vacc21 = bias1;
-    int32_t vacc31 = bias1;
-    int32_t vacc41 = bias1;
-    int32_t vacc51 = bias1;
-    int32_t vacc61 = bias1;
-    int32_t vacc71 = bias1;
-    int32_t vacc81 = bias1;
-    int32_t vacc91 = bias1;
-    int32_t vacc101 = bias1;
-    int32_t vacc111 = bias1;
-    int32_t bias2 = ((const int32_t*)w)[2];
-    int32_t vacc02 = bias2;
-    int32_t vacc12 = bias2;
-    int32_t vacc22 = bias2;
-    int32_t vacc32 = bias2;
-    int32_t vacc42 = bias2;
-    int32_t vacc52 = bias2;
-    int32_t vacc62 = bias2;
-    int32_t vacc72 = bias2;
-    int32_t vacc82 = bias2;
-    int32_t vacc92 = bias2;
-    int32_t vacc102 = bias2;
-    int32_t vacc112 = bias2;
-    int32_t bias3 = ((const int32_t*)w)[3];
-    int32_t vacc03 = bias3;
-    int32_t vacc13 = bias3;
-    int32_t vacc23 = bias3;
-    int32_t vacc33 = bias3;
-    int32_t vacc43 = bias3;
-    int32_t vacc53 = bias3;
-    int32_t vacc63 = bias3;
-    int32_t vacc73 = bias3;
-    int32_t vacc83 = bias3;
-    int32_t vacc93 = bias3;
-    int32_t vacc103 = bias3;
-    int32_t vacc113 = bias3;
-
-    w = (const void*)((uintptr_t)w + 4 * sizeof(int32_t));
-
-    // Inner accumulation loop along the 4 columns.
-    // Handle 4 rows at each iteration: this is key to modelling what an
-    // actual kernel using ARMv8.2 dot-product instructions would look like.
-    size_t k = 0;
-    while (k < kc) {
-      // Load a 12x4 block of activations.
-      int32_t va00 = *a0++;
-      int32_t va01 = *a0++;
-      int32_t va02 = *a0++;
-      int32_t va03 = *a0++;
-      int32_t va10 = *a1++;
-      int32_t va11 = *a1++;
-      int32_t va12 = *a1++;
-      int32_t va13 = *a1++;
-      int32_t va20 = *a2++;
-      int32_t va21 = *a2++;
-      int32_t va22 = *a2++;
-      int32_t va23 = *a2++;
-      int32_t va30 = *a3++;
-      int32_t va31 = *a3++;
-      int32_t va32 = *a3++;
-      int32_t va33 = *a3++;
-      int32_t va40 = *a4++;
-      int32_t va41 = *a4++;
-      int32_t va42 = *a4++;
-      int32_t va43 = *a4++;
-      int32_t va50 = *a5++;
-      int32_t va51 = *a5++;
-      int32_t va52 = *a5++;
-      int32_t va53 = *a5++;
-      int32_t va60 = *a6++;
-      int32_t va61 = *a6++;
-      int32_t va62 = *a6++;
-      int32_t va63 = *a6++;
-      int32_t va70 = *a7++;
-      int32_t va71 = *a7++;
-      int32_t va72 = *a7++;
-      int32_t va73 = *a7++;
-      int32_t va80 = *a8++;
-      int32_t va81 = *a8++;
-      int32_t va82 = *a8++;
-      int32_t va83 = *a8++;
-      int32_t va90 = *a9++;
-      int32_t va91 = *a9++;
-      int32_t va92 = *a9++;
-      int32_t va93 = *a9++;
-      int32_t va100 = *a10++;
-      int32_t va101 = *a10++;
-      int32_t va102 = *a10++;
-      int32_t va103 = *a10++;
-      int32_t va110 = *a11++;
-      int32_t va111 = *a11++;
-      int32_t va112 = *a11++;
-      int32_t va113 = *a11++;
-
-      // Load a 4x4 block of weights.
-      int32_t vb00 = ((const int8_t*)w)[0];
-      int32_t vb10 = ((const int8_t*)w)[1];
-      int32_t vb20 = ((const int8_t*)w)[2];
-      int32_t vb30 = ((const int8_t*)w)[3];
-
-      w = (const void*)((uintptr_t)w + 4 * sizeof(int8_t));
-      int32_t vb01 = ((const int8_t*)w)[0];
-      int32_t vb11 = ((const int8_t*)w)[1];
-      int32_t vb21 = ((const int8_t*)w)[2];
-      int32_t vb31 = ((const int8_t*)w)[3];
-
-      w = (const void*)((uintptr_t)w + 4 * sizeof(int8_t));
-      int32_t vb02 = ((const int8_t*)w)[0];
-      int32_t vb12 = ((const int8_t*)w)[1];
-      int32_t vb22 = ((const int8_t*)w)[2];
-      int32_t vb32 = ((const int8_t*)w)[3];
-
-      w = (const void*)((uintptr_t)w + 4 * sizeof(int8_t));
-      int32_t vb03 = ((const int8_t*)w)[0];
-      int32_t vb13 = ((const int8_t*)w)[1];
-      int32_t vb23 = ((const int8_t*)w)[2];
-      int32_t vb33 = ((const int8_t*)w)[3];
-
-      w = (const void*)((uintptr_t)w + 4 * sizeof(int8_t));
-
-      // Multiply-accumulate: 12x4 * 4x4 --> 12x4. The inner size 4 here means
-      // we're computing 4D dot-products, which makes this a model for
-      // an ARMv8.2 dot-product kernel.
-      vacc00 += va00 * vb00;
-      vacc00 += va01 * vb10;
-      vacc00 += va02 * vb20;
-      vacc00 += va03 * vb30;
-      vacc01 += va00 * vb01;
-      vacc01 += va01 * vb11;
-      vacc01 += va02 * vb21;
-      vacc01 += va03 * vb31;
-      vacc02 += va00 * vb02;
-      vacc02 += va01 * vb12;
-      vacc02 += va02 * vb22;
-      vacc02 += va03 * vb32;
-      vacc03 += va00 * vb03;
-      vacc03 += va01 * vb13;
-      vacc03 += va02 * vb23;
-      vacc03 += va03 * vb33;
-      vacc10 += va10 * vb00;
-      vacc10 += va11 * vb10;
-      vacc10 += va12 * vb20;
-      vacc10 += va13 * vb30;
-      vacc11 += va10 * vb01;
-      vacc11 += va11 * vb11;
-      vacc11 += va12 * vb21;
-      vacc11 += va13 * vb31;
-      vacc12 += va10 * vb02;
-      vacc12 += va11 * vb12;
-      vacc12 += va12 * vb22;
-      vacc12 += va13 * vb32;
-      vacc13 += va10 * vb03;
-      vacc13 += va11 * vb13;
-      vacc13 += va12 * vb23;
-      vacc13 += va13 * vb33;
-      vacc20 += va20 * vb00;
-      vacc20 += va21 * vb10;
-      vacc20 += va22 * vb20;
-      vacc20 += va23 * vb30;
-      vacc21 += va20 * vb01;
-      vacc21 += va21 * vb11;
-      vacc21 += va22 * vb21;
-      vacc21 += va23 * vb31;
-      vacc22 += va20 * vb02;
-      vacc22 += va21 * vb12;
-      vacc22 += va22 * vb22;
-      vacc22 += va23 * vb32;
-      vacc23 += va20 * vb03;
-      vacc23 += va21 * vb13;
-      vacc23 += va22 * vb23;
-      vacc23 += va23 * vb33;
-      vacc30 += va30 * vb00;
-      vacc30 += va31 * vb10;
-      vacc30 += va32 * vb20;
-      vacc30 += va33 * vb30;
-      vacc31 += va30 * vb01;
-      vacc31 += va31 * vb11;
-      vacc31 += va32 * vb21;
-      vacc31 += va33 * vb31;
-      vacc32 += va30 * vb02;
-      vacc32 += va31 * vb12;
-      vacc32 += va32 * vb22;
-      vacc32 += va33 * vb32;
-      vacc33 += va30 * vb03;
-      vacc33 += va31 * vb13;
-      vacc33 += va32 * vb23;
-      vacc33 += va33 * vb33;
-      vacc40 += va40 * vb00;
-      vacc40 += va41 * vb10;
-      vacc40 += va42 * vb20;
-      vacc40 += va43 * vb30;
-      vacc41 += va40 * vb01;
-      vacc41 += va41 * vb11;
-      vacc41 += va42 * vb21;
-      vacc41 += va43 * vb31;
-      vacc42 += va40 * vb02;
-      vacc42 += va41 * vb12;
-      vacc42 += va42 * vb22;
-      vacc42 += va43 * vb32;
-      vacc43 += va40 * vb03;
-      vacc43 += va41 * vb13;
-      vacc43 += va42 * vb23;
-      vacc43 += va43 * vb33;
-      vacc50 += va50 * vb00;
-      vacc50 += va51 * vb10;
-      vacc50 += va52 * vb20;
-      vacc50 += va53 * vb30;
-      vacc51 += va50 * vb01;
-      vacc51 += va51 * vb11;
-      vacc51 += va52 * vb21;
-      vacc51 += va53 * vb31;
-      vacc52 += va50 * vb02;
-      vacc52 += va51 * vb12;
-      vacc52 += va52 * vb22;
-      vacc52 += va53 * vb32;
-      vacc53 += va50 * vb03;
-      vacc53 += va51 * vb13;
-      vacc53 += va52 * vb23;
-      vacc53 += va53 * vb33;
-      vacc60 += va60 * vb00;
-      vacc60 += va61 * vb10;
-      vacc60 += va62 * vb20;
-      vacc60 += va63 * vb30;
-      vacc61 += va60 * vb01;
-      vacc61 += va61 * vb11;
-      vacc61 += va62 * vb21;
-      vacc61 += va63 * vb31;
-      vacc62 += va60 * vb02;
-      vacc62 += va61 * vb12;
-      vacc62 += va62 * vb22;
-      vacc62 += va63 * vb32;
-      vacc63 += va60 * vb03;
-      vacc63 += va61 * vb13;
-      vacc63 += va62 * vb23;
-      vacc63 += va63 * vb33;
-      vacc70 += va70 * vb00;
-      vacc70 += va71 * vb10;
-      vacc70 += va72 * vb20;
-      vacc70 += va73 * vb30;
-      vacc71 += va70 * vb01;
-      vacc71 += va71 * vb11;
-      vacc71 += va72 * vb21;
-      vacc71 += va73 * vb31;
-      vacc72 += va70 * vb02;
-      vacc72 += va71 * vb12;
-      vacc72 += va72 * vb22;
-      vacc72 += va73 * vb32;
-      vacc73 += va70 * vb03;
-      vacc73 += va71 * vb13;
-      vacc73 += va72 * vb23;
-      vacc73 += va73 * vb33;
-      vacc80 += va80 * vb00;
-      vacc80 += va81 * vb10;
-      vacc80 += va82 * vb20;
-      vacc80 += va83 * vb30;
-      vacc81 += va80 * vb01;
-      vacc81 += va81 * vb11;
-      vacc81 += va82 * vb21;
-      vacc81 += va83 * vb31;
-      vacc82 += va80 * vb02;
-      vacc82 += va81 * vb12;
-      vacc82 += va82 * vb22;
-      vacc82 += va83 * vb32;
-      vacc83 += va80 * vb03;
-      vacc83 += va81 * vb13;
-      vacc83 += va82 * vb23;
-      vacc83 += va83 * vb33;
-      vacc90 += va90 * vb00;
-      vacc90 += va91 * vb10;
-      vacc90 += va92 * vb20;
-      vacc90 += va93 * vb30;
-      vacc91 += va90 * vb01;
-      vacc91 += va91 * vb11;
-      vacc91 += va92 * vb21;
-      vacc91 += va93 * vb31;
-      vacc92 += va90 * vb02;
-      vacc92 += va91 * vb12;
-      vacc92 += va92 * vb22;
-      vacc92 += va93 * vb32;
-      vacc93 += va90 * vb03;
-      vacc93 += va91 * vb13;
-      vacc93 += va92 * vb23;
-      vacc93 += va93 * vb33;
-      vacc100 += va100 * vb00;
-      vacc100 += va101 * vb10;
-      vacc100 += va102 * vb20;
-      vacc100 += va103 * vb30;
-      vacc101 += va100 * vb01;
-      vacc101 += va101 * vb11;
-      vacc101 += va102 * vb21;
-      vacc101 += va103 * vb31;
-      vacc102 += va100 * vb02;
-      vacc102 += va101 * vb12;
-      vacc102 += va102 * vb22;
-      vacc102 += va103 * vb32;
-      vacc103 += va100 * vb03;
-      vacc103 += va101 * vb13;
-      vacc103 += va102 * vb23;
-      vacc103 += va103 * vb33;
-      vacc110 += va110 * vb00;
-      vacc110 += va111 * vb10;
-      vacc110 += va112 * vb20;
-      vacc110 += va113 * vb30;
-      vacc111 += va110 * vb01;
-      vacc111 += va111 * vb11;
-      vacc111 += va112 * vb21;
-      vacc111 += va113 * vb31;
-      vacc112 += va110 * vb02;
-      vacc112 += va111 * vb12;
-      vacc112 += va112 * vb22;
-      vacc112 += va113 * vb32;
-      vacc113 += va110 * vb03;
-      vacc113 += va111 * vb13;
-      vacc113 += va112 * vb23;
-      vacc113 += va113 * vb33;
-
-      k += 4 * sizeof(int8_t);
-    }
-    // End of accumulation loop. The variable `k` contains the amount by which
-    // we advanced the `va` pointers, so we rewind by this amount now.
-    a0 = (const int8_t*)((uintptr_t)a0 - k);
-    a1 = (const int8_t*)((uintptr_t)a1 - k);
-    a2 = (const int8_t*)((uintptr_t)a2 - k);
-    a3 = (const int8_t*)((uintptr_t)a3 - k);
-    a4 = (const int8_t*)((uintptr_t)a4 - k);
-    a5 = (const int8_t*)((uintptr_t)a5 - k);
-    a6 = (const int8_t*)((uintptr_t)a6 - k);
-    a7 = (const int8_t*)((uintptr_t)a7 - k);
-    a8 = (const int8_t*)((uintptr_t)a8 - k);
-    a9 = (const int8_t*)((uintptr_t)a9 - k);
-    a10 = (const int8_t*)((uintptr_t)a10 - k);
-    a11 = (const int8_t*)((uintptr_t)a11 - k);
-
-    // Post-accumulation work
-
-    const int32_t vmultiplier = params->scalar.multiplier;
-    const int64_t vq31rounding = INT64_C(0x40000000);
-    const int32_t vremainder_mask = params->scalar.remainder_mask;
-    const uint32_t vshift = params->scalar.shift;
-    const int32_t vremainder_threshold = params->scalar.remainder_threshold;
-    const int32_t voutput_min = params->scalar.output_min_less_zero_point;
-    const int32_t voutput_max = params->scalar.output_max_less_zero_point;
-    const int32_t voutput_zero_point = params->scalar.output_zero_point;
-
-    const int64_t vproduct00 = (int64_t)vacc00 * (int64_t)vmultiplier;
-    const int64_t vproduct01 = (int64_t)vacc01 * (int64_t)vmultiplier;
-    const int64_t vproduct02 = (int64_t)vacc02 * (int64_t)vmultiplier;
-    const int64_t vproduct03 = (int64_t)vacc03 * (int64_t)vmultiplier;
-    const int64_t vproduct10 = (int64_t)vacc10 * (int64_t)vmultiplier;
-    const int64_t vproduct11 = (int64_t)vacc11 * (int64_t)vmultiplier;
-    const int64_t vproduct12 = (int64_t)vacc12 * (int64_t)vmultiplier;
-    const int64_t vproduct13 = (int64_t)vacc13 * (int64_t)vmultiplier;
-    const int64_t vproduct20 = (int64_t)vacc20 * (int64_t)vmultiplier;
-    const int64_t vproduct21 = (int64_t)vacc21 * (int64_t)vmultiplier;
-    const int64_t vproduct22 = (int64_t)vacc22 * (int64_t)vmultiplier;
-    const int64_t vproduct23 = (int64_t)vacc23 * (int64_t)vmultiplier;
-    const int64_t vproduct30 = (int64_t)vacc30 * (int64_t)vmultiplier;
-    const int64_t vproduct31 = (int64_t)vacc31 * (int64_t)vmultiplier;
-    const int64_t vproduct32 = (int64_t)vacc32 * (int64_t)vmultiplier;
-    const int64_t vproduct33 = (int64_t)vacc33 * (int64_t)vmultiplier;
-    const int64_t vproduct40 = (int64_t)vacc40 * (int64_t)vmultiplier;
-    const int64_t vproduct41 = (int64_t)vacc41 * (int64_t)vmultiplier;
-    const int64_t vproduct42 = (int64_t)vacc42 * (int64_t)vmultiplier;
-    const int64_t vproduct43 = (int64_t)vacc43 * (int64_t)vmultiplier;
-    const int64_t vproduct50 = (int64_t)vacc50 * (int64_t)vmultiplier;
-    const int64_t vproduct51 = (int64_t)vacc51 * (int64_t)vmultiplier;
-    const int64_t vproduct52 = (int64_t)vacc52 * (int64_t)vmultiplier;
-    const int64_t vproduct53 = (int64_t)vacc53 * (int64_t)vmultiplier;
-    const int64_t vproduct60 = (int64_t)vacc60 * (int64_t)vmultiplier;
-    const int64_t vproduct61 = (int64_t)vacc61 * (int64_t)vmultiplier;
-    const int64_t vproduct62 = (int64_t)vacc62 * (int64_t)vmultiplier;
-    const int64_t vproduct63 = (int64_t)vacc63 * (int64_t)vmultiplier;
-    const int64_t vproduct70 = (int64_t)vacc70 * (int64_t)vmultiplier;
-    const int64_t vproduct71 = (int64_t)vacc71 * (int64_t)vmultiplier;
-    const int64_t vproduct72 = (int64_t)vacc72 * (int64_t)vmultiplier;
-    const int64_t vproduct73 = (int64_t)vacc73 * (int64_t)vmultiplier;
-    const int64_t vproduct80 = (int64_t)vacc80 * (int64_t)vmultiplier;
-    const int64_t vproduct81 = (int64_t)vacc81 * (int64_t)vmultiplier;
-    const int64_t vproduct82 = (int64_t)vacc82 * (int64_t)vmultiplier;
-    const int64_t vproduct83 = (int64_t)vacc83 * (int64_t)vmultiplier;
-    const int64_t vproduct90 = (int64_t)vacc90 * (int64_t)vmultiplier;
-    const int64_t vproduct91 = (int64_t)vacc91 * (int64_t)vmultiplier;
-    const int64_t vproduct92 = (int64_t)vacc92 * (int64_t)vmultiplier;
-    const int64_t vproduct93 = (int64_t)vacc93 * (int64_t)vmultiplier;
-    const int64_t vproduct100 = (int64_t)vacc100 * (int64_t)vmultiplier;
-    const int64_t vproduct101 = (int64_t)vacc101 * (int64_t)vmultiplier;
-    const int64_t vproduct102 = (int64_t)vacc102 * (int64_t)vmultiplier;
-    const int64_t vproduct103 = (int64_t)vacc103 * (int64_t)vmultiplier;
-    const int64_t vproduct110 = (int64_t)vacc110 * (int64_t)vmultiplier;
-    const int64_t vproduct111 = (int64_t)vacc111 * (int64_t)vmultiplier;
-    const int64_t vproduct112 = (int64_t)vacc112 * (int64_t)vmultiplier;
-    const int64_t vproduct113 = (int64_t)vacc113 * (int64_t)vmultiplier;
-
-    const int32_t vq31product00 = (int32_t)(uint32_t)((uint64_t)(vproduct00 + vq31rounding) >> 31);
-    const int32_t vq31product01 = (int32_t)(uint32_t)((uint64_t)(vproduct01 + vq31rounding) >> 31);
-    const int32_t vq31product02 = (int32_t)(uint32_t)((uint64_t)(vproduct02 + vq31rounding) >> 31);
-    const int32_t vq31product03 = (int32_t)(uint32_t)((uint64_t)(vproduct03 + vq31rounding) >> 31);
-    const int32_t vq31product10 = (int32_t)(uint32_t)((uint64_t)(vproduct10 + vq31rounding) >> 31);
-    const int32_t vq31product11 = (int32_t)(uint32_t)((uint64_t)(vproduct11 + vq31rounding) >> 31);
-    const int32_t vq31product12 = (int32_t)(uint32_t)((uint64_t)(vproduct12 + vq31rounding) >> 31);
-    const int32_t vq31product13 = (int32_t)(uint32_t)((uint64_t)(vproduct13 + vq31rounding) >> 31);
-    const int32_t vq31product20 = (int32_t)(uint32_t)((uint64_t)(vproduct20 + vq31rounding) >> 31);
-    const int32_t vq31product21 = (int32_t)(uint32_t)((uint64_t)(vproduct21 + vq31rounding) >> 31);
-    const int32_t vq31product22 = (int32_t)(uint32_t)((uint64_t)(vproduct22 + vq31rounding) >> 31);
-    const int32_t vq31product23 = (int32_t)(uint32_t)((uint64_t)(vproduct23 + vq31rounding) >> 31);
-    const int32_t vq31product30 = (int32_t)(uint32_t)((uint64_t)(vproduct30 + vq31rounding) >> 31);
-    const int32_t vq31product31 = (int32_t)(uint32_t)((uint64_t)(vproduct31 + vq31rounding) >> 31);
-    const int32_t vq31product32 = (int32_t)(uint32_t)((uint64_t)(vproduct32 + vq31rounding) >> 31);
-    const int32_t vq31product33 = (int32_t)(uint32_t)((uint64_t)(vproduct33 + vq31rounding) >> 31);
-    const int32_t vq31product40 = (int32_t)(uint32_t)((uint64_t)(vproduct40 + vq31rounding) >> 31);
-    const int32_t vq31product41 = (int32_t)(uint32_t)((uint64_t)(vproduct41 + vq31rounding) >> 31);
-    const int32_t vq31product42 = (int32_t)(uint32_t)((uint64_t)(vproduct42 + vq31rounding) >> 31);
-    const int32_t vq31product43 = (int32_t)(uint32_t)((uint64_t)(vproduct43 + vq31rounding) >> 31);
-    const int32_t vq31product50 = (int32_t)(uint32_t)((uint64_t)(vproduct50 + vq31rounding) >> 31);
-    const int32_t vq31product51 = (int32_t)(uint32_t)((uint64_t)(vproduct51 + vq31rounding) >> 31);
-    const int32_t vq31product52 = (int32_t)(uint32_t)((uint64_t)(vproduct52 + vq31rounding) >> 31);
-    const int32_t vq31product53 = (int32_t)(uint32_t)((uint64_t)(vproduct53 + vq31rounding) >> 31);
-    const int32_t vq31product60 = (int32_t)(uint32_t)((uint64_t)(vproduct60 + vq31rounding) >> 31);
-    const int32_t vq31product61 = (int32_t)(uint32_t)((uint64_t)(vproduct61 + vq31rounding) >> 31);
-    const int32_t vq31product62 = (int32_t)(uint32_t)((uint64_t)(vproduct62 + vq31rounding) >> 31);
-    const int32_t vq31product63 = (int32_t)(uint32_t)((uint64_t)(vproduct63 + vq31rounding) >> 31);
-    const int32_t vq31product70 = (int32_t)(uint32_t)((uint64_t)(vproduct70 + vq31rounding) >> 31);
-    const int32_t vq31product71 = (int32_t)(uint32_t)((uint64_t)(vproduct71 + vq31rounding) >> 31);
-    const int32_t vq31product72 = (int32_t)(uint32_t)((uint64_t)(vproduct72 + vq31rounding) >> 31);
-    const int32_t vq31product73 = (int32_t)(uint32_t)((uint64_t)(vproduct73 + vq31rounding) >> 31);
-    const int32_t vq31product80 = (int32_t)(uint32_t)((uint64_t)(vproduct80 + vq31rounding) >> 31);
-    const int32_t vq31product81 = (int32_t)(uint32_t)((uint64_t)(vproduct81 + vq31rounding) >> 31);
-    const int32_t vq31product82 = (int32_t)(uint32_t)((uint64_t)(vproduct82 + vq31rounding) >> 31);
-    const int32_t vq31product83 = (int32_t)(uint32_t)((uint64_t)(vproduct83 + vq31rounding) >> 31);
-    const int32_t vq31product90 = (int32_t)(uint32_t)((uint64_t)(vproduct90 + vq31rounding) >> 31);
-    const int32_t vq31product91 = (int32_t)(uint32_t)((uint64_t)(vproduct91 + vq31rounding) >> 31);
-    const int32_t vq31product92 = (int32_t)(uint32_t)((uint64_t)(vproduct92 + vq31rounding) >> 31);
-    const int32_t vq31product93 = (int32_t)(uint32_t)((uint64_t)(vproduct93 + vq31rounding) >> 31);
-    const int32_t vq31product100 = (int32_t)(uint32_t)((uint64_t)(vproduct100 + vq31rounding) >> 31);
-    const int32_t vq31product101 = (int32_t)(uint32_t)((uint64_t)(vproduct101 + vq31rounding) >> 31);
-    const int32_t vq31product102 = (int32_t)(uint32_t)((uint64_t)(vproduct102 + vq31rounding) >> 31);
-    const int32_t vq31product103 = (int32_t)(uint32_t)((uint64_t)(vproduct103 + vq31rounding) >> 31);
-    const int32_t vq31product110 = (int32_t)(uint32_t)((uint64_t)(vproduct110 + vq31rounding) >> 31);
-    const int32_t vq31product111 = (int32_t)(uint32_t)((uint64_t)(vproduct111 + vq31rounding) >> 31);
-    const int32_t vq31product112 = (int32_t)(uint32_t)((uint64_t)(vproduct112 + vq31rounding) >> 31);
-    const int32_t vq31product113 = (int32_t)(uint32_t)((uint64_t)(vproduct113 + vq31rounding) >> 31);
-
-    const int32_t vremainder00 = (vq31product00 & vremainder_mask) - (int32_t)(vq31product00 < 0);
-    const int32_t vremainder01 = (vq31product01 & vremainder_mask) - (int32_t)(vq31product01 < 0);
-    const int32_t vremainder02 = (vq31product02 & vremainder_mask) - (int32_t)(vq31product02 < 0);
-    const int32_t vremainder03 = (vq31product03 & vremainder_mask) - (int32_t)(vq31product03 < 0);
-    const int32_t vremainder10 = (vq31product10 & vremainder_mask) - (int32_t)(vq31product10 < 0);
-    const int32_t vremainder11 = (vq31product11 & vremainder_mask) - (int32_t)(vq31product11 < 0);
-    const int32_t vremainder12 = (vq31product12 & vremainder_mask) - (int32_t)(vq31product12 < 0);
-    const int32_t vremainder13 = (vq31product13 & vremainder_mask) - (int32_t)(vq31product13 < 0);
-    const int32_t vremainder20 = (vq31product20 & vremainder_mask) - (int32_t)(vq31product20 < 0);
-    const int32_t vremainder21 = (vq31product21 & vremainder_mask) - (int32_t)(vq31product21 < 0);
-    const int32_t vremainder22 = (vq31product22 & vremainder_mask) - (int32_t)(vq31product22 < 0);
-    const int32_t vremainder23 = (vq31product23 & vremainder_mask) - (int32_t)(vq31product23 < 0);
-    const int32_t vremainder30 = (vq31product30 & vremainder_mask) - (int32_t)(vq31product30 < 0);
-    const int32_t vremainder31 = (vq31product31 & vremainder_mask) - (int32_t)(vq31product31 < 0);
-    const int32_t vremainder32 = (vq31product32 & vremainder_mask) - (int32_t)(vq31product32 < 0);
-    const int32_t vremainder33 = (vq31product33 & vremainder_mask) - (int32_t)(vq31product33 < 0);
-    const int32_t vremainder40 = (vq31product40 & vremainder_mask) - (int32_t)(vq31product40 < 0);
-    const int32_t vremainder41 = (vq31product41 & vremainder_mask) - (int32_t)(vq31product41 < 0);
-    const int32_t vremainder42 = (vq31product42 & vremainder_mask) - (int32_t)(vq31product42 < 0);
-    const int32_t vremainder43 = (vq31product43 & vremainder_mask) - (int32_t)(vq31product43 < 0);
-    const int32_t vremainder50 = (vq31product50 & vremainder_mask) - (int32_t)(vq31product50 < 0);
-    const int32_t vremainder51 = (vq31product51 & vremainder_mask) - (int32_t)(vq31product51 < 0);
-    const int32_t vremainder52 = (vq31product52 & vremainder_mask) - (int32_t)(vq31product52 < 0);
-    const int32_t vremainder53 = (vq31product53 & vremainder_mask) - (int32_t)(vq31product53 < 0);
-    const int32_t vremainder60 = (vq31product60 & vremainder_mask) - (int32_t)(vq31product60 < 0);
-    const int32_t vremainder61 = (vq31product61 & vremainder_mask) - (int32_t)(vq31product61 < 0);
-    const int32_t vremainder62 = (vq31product62 & vremainder_mask) - (int32_t)(vq31product62 < 0);
-    const int32_t vremainder63 = (vq31product63 & vremainder_mask) - (int32_t)(vq31product63 < 0);
-    const int32_t vremainder70 = (vq31product70 & vremainder_mask) - (int32_t)(vq31product70 < 0);
-    const int32_t vremainder71 = (vq31product71 & vremainder_mask) - (int32_t)(vq31product71 < 0);
-    const int32_t vremainder72 = (vq31product72 & vremainder_mask) - (int32_t)(vq31product72 < 0);
-    const int32_t vremainder73 = (vq31product73 & vremainder_mask) - (int32_t)(vq31product73 < 0);
-    const int32_t vremainder80 = (vq31product80 & vremainder_mask) - (int32_t)(vq31product80 < 0);
-    const int32_t vremainder81 = (vq31product81 & vremainder_mask) - (int32_t)(vq31product81 < 0);
-    const int32_t vremainder82 = (vq31product82 & vremainder_mask) - (int32_t)(vq31product82 < 0);
-    const int32_t vremainder83 = (vq31product83 & vremainder_mask) - (int32_t)(vq31product83 < 0);
-    const int32_t vremainder90 = (vq31product90 & vremainder_mask) - (int32_t)(vq31product90 < 0);
-    const int32_t vremainder91 = (vq31product91 & vremainder_mask) - (int32_t)(vq31product91 < 0);
-    const int32_t vremainder92 = (vq31product92 & vremainder_mask) - (int32_t)(vq31product92 < 0);
-    const int32_t vremainder93 = (vq31product93 & vremainder_mask) - (int32_t)(vq31product93 < 0);
-    const int32_t vremainder100 = (vq31product100 & vremainder_mask) - (int32_t)(vq31product100 < 0);
-    const int32_t vremainder101 = (vq31product101 & vremainder_mask) - (int32_t)(vq31product101 < 0);
-    const int32_t vremainder102 = (vq31product102 & vremainder_mask) - (int32_t)(vq31product102 < 0);
-    const int32_t vremainder103 = (vq31product103 & vremainder_mask) - (int32_t)(vq31product103 < 0);
-    const int32_t vremainder110 = (vq31product110 & vremainder_mask) - (int32_t)(vq31product110 < 0);
-    const int32_t vremainder111 = (vq31product111 & vremainder_mask) - (int32_t)(vq31product111 < 0);
-    const int32_t vremainder112 = (vq31product112 & vremainder_mask) - (int32_t)(vq31product112 < 0);
-    const int32_t vremainder113 = (vq31product113 & vremainder_mask) - (int32_t)(vq31product113 < 0);
-
-    int32_t vout00 = asr_s32(vq31product00, vshift) + (int32_t)(vremainder00 > vremainder_threshold);
-    int32_t vout01 = asr_s32(vq31product01, vshift) + (int32_t)(vremainder01 > vremainder_threshold);
-    int32_t vout02 = asr_s32(vq31product02, vshift) + (int32_t)(vremainder02 > vremainder_threshold);
-    int32_t vout03 = asr_s32(vq31product03, vshift) + (int32_t)(vremainder03 > vremainder_threshold);
-    int32_t vout10 = asr_s32(vq31product10, vshift) + (int32_t)(vremainder10 > vremainder_threshold);
-    int32_t vout11 = asr_s32(vq31product11, vshift) + (int32_t)(vremainder11 > vremainder_threshold);
-    int32_t vout12 = asr_s32(vq31product12, vshift) + (int32_t)(vremainder12 > vremainder_threshold);
-    int32_t vout13 = asr_s32(vq31product13, vshift) + (int32_t)(vremainder13 > vremainder_threshold);
-    int32_t vout20 = asr_s32(vq31product20, vshift) + (int32_t)(vremainder20 > vremainder_threshold);
-    int32_t vout21 = asr_s32(vq31product21, vshift) + (int32_t)(vremainder21 > vremainder_threshold);
-    int32_t vout22 = asr_s32(vq31product22, vshift) + (int32_t)(vremainder22 > vremainder_threshold);
-    int32_t vout23 = asr_s32(vq31product23, vshift) + (int32_t)(vremainder23 > vremainder_threshold);
-    int32_t vout30 = asr_s32(vq31product30, vshift) + (int32_t)(vremainder30 > vremainder_threshold);
-    int32_t vout31 = asr_s32(vq31product31, vshift) + (int32_t)(vremainder31 > vremainder_threshold);
-    int32_t vout32 = asr_s32(vq31product32, vshift) + (int32_t)(vremainder32 > vremainder_threshold);
-    int32_t vout33 = asr_s32(vq31product33, vshift) + (int32_t)(vremainder33 > vremainder_threshold);
-    int32_t vout40 = asr_s32(vq31product40, vshift) + (int32_t)(vremainder40 > vremainder_threshold);
-    int32_t vout41 = asr_s32(vq31product41, vshift) + (int32_t)(vremainder41 > vremainder_threshold);
-    int32_t vout42 = asr_s32(vq31product42, vshift) + (int32_t)(vremainder42 > vremainder_threshold);
-    int32_t vout43 = asr_s32(vq31product43, vshift) + (int32_t)(vremainder43 > vremainder_threshold);
-    int32_t vout50 = asr_s32(vq31product50, vshift) + (int32_t)(vremainder50 > vremainder_threshold);
-    int32_t vout51 = asr_s32(vq31product51, vshift) + (int32_t)(vremainder51 > vremainder_threshold);
-    int32_t vout52 = asr_s32(vq31product52, vshift) + (int32_t)(vremainder52 > vremainder_threshold);
-    int32_t vout53 = asr_s32(vq31product53, vshift) + (int32_t)(vremainder53 > vremainder_threshold);
-    int32_t vout60 = asr_s32(vq31product60, vshift) + (int32_t)(vremainder60 > vremainder_threshold);
-    int32_t vout61 = asr_s32(vq31product61, vshift) + (int32_t)(vremainder61 > vremainder_threshold);
-    int32_t vout62 = asr_s32(vq31product62, vshift) + (int32_t)(vremainder62 > vremainder_threshold);
-    int32_t vout63 = asr_s32(vq31product63, vshift) + (int32_t)(vremainder63 > vremainder_threshold);
-    int32_t vout70 = asr_s32(vq31product70, vshift) + (int32_t)(vremainder70 > vremainder_threshold);
-    int32_t vout71 = asr_s32(vq31product71, vshift) + (int32_t)(vremainder71 > vremainder_threshold);
-    int32_t vout72 = asr_s32(vq31product72, vshift) + (int32_t)(vremainder72 > vremainder_threshold);
-    int32_t vout73 = asr_s32(vq31product73, vshift) + (int32_t)(vremainder73 > vremainder_threshold);
-    int32_t vout80 = asr_s32(vq31product80, vshift) + (int32_t)(vremainder80 > vremainder_threshold);
-    int32_t vout81 = asr_s32(vq31product81, vshift) + (int32_t)(vremainder81 > vremainder_threshold);
-    int32_t vout82 = asr_s32(vq31product82, vshift) + (int32_t)(vremainder82 > vremainder_threshold);
-    int32_t vout83 = asr_s32(vq31product83, vshift) + (int32_t)(vremainder83 > vremainder_threshold);
-    int32_t vout90 = asr_s32(vq31product90, vshift) + (int32_t)(vremainder90 > vremainder_threshold);
-    int32_t vout91 = asr_s32(vq31product91, vshift) + (int32_t)(vremainder91 > vremainder_threshold);
-    int32_t vout92 = asr_s32(vq31product92, vshift) + (int32_t)(vremainder92 > vremainder_threshold);
-    int32_t vout93 = asr_s32(vq31product93, vshift) + (int32_t)(vremainder93 > vremainder_threshold);
-    int32_t vout100 = asr_s32(vq31product100, vshift) + (int32_t)(vremainder100 > vremainder_threshold);
-    int32_t vout101 = asr_s32(vq31product101, vshift) + (int32_t)(vremainder101 > vremainder_threshold);
-    int32_t vout102 = asr_s32(vq31product102, vshift) + (int32_t)(vremainder102 > vremainder_threshold);
-    int32_t vout103 = asr_s32(vq31product103, vshift) + (int32_t)(vremainder103 > vremainder_threshold);
-    int32_t vout110 = asr_s32(vq31product110, vshift) + (int32_t)(vremainder110 > vremainder_threshold);
-    int32_t vout111 = asr_s32(vq31product111, vshift) + (int32_t)(vremainder111 > vremainder_threshold);
-    int32_t vout112 = asr_s32(vq31product112, vshift) + (int32_t)(vremainder112 > vremainder_threshold);
-    int32_t vout113 = asr_s32(vq31product113, vshift) + (int32_t)(vremainder113 > vremainder_threshold);
-
-    vout00 = vout00 < voutput_min ? voutput_min : vout00;
-    vout01 = vout01 < voutput_min ? voutput_min : vout01;
-    vout02 = vout02 < voutput_min ? voutput_min : vout02;
-    vout03 = vout03 < voutput_min ? voutput_min : vout03;
-    vout10 = vout10 < voutput_min ? voutput_min : vout10;
-    vout11 = vout11 < voutput_min ? voutput_min : vout11;
-    vout12 = vout12 < voutput_min ? voutput_min : vout12;
-    vout13 = vout13 < voutput_min ? voutput_min : vout13;
-    vout20 = vout20 < voutput_min ? voutput_min : vout20;
-    vout21 = vout21 < voutput_min ? voutput_min : vout21;
-    vout22 = vout22 < voutput_min ? voutput_min : vout22;
-    vout23 = vout23 < voutput_min ? voutput_min : vout23;
-    vout30 = vout30 < voutput_min ? voutput_min : vout30;
-    vout31 = vout31 < voutput_min ? voutput_min : vout31;
-    vout32 = vout32 < voutput_min ? voutput_min : vout32;
-    vout33 = vout33 < voutput_min ? voutput_min : vout33;
-    vout40 = vout40 < voutput_min ? voutput_min : vout40;
-    vout41 = vout41 < voutput_min ? voutput_min : vout41;
-    vout42 = vout42 < voutput_min ? voutput_min : vout42;
-    vout43 = vout43 < voutput_min ? voutput_min : vout43;
-    vout50 = vout50 < voutput_min ? voutput_min : vout50;
-    vout51 = vout51 < voutput_min ? voutput_min : vout51;
-    vout52 = vout52 < voutput_min ? voutput_min : vout52;
-    vout53 = vout53 < voutput_min ? voutput_min : vout53;
-    vout60 = vout60 < voutput_min ? voutput_min : vout60;
-    vout61 = vout61 < voutput_min ? voutput_min : vout61;
-    vout62 = vout62 < voutput_min ? voutput_min : vout62;
-    vout63 = vout63 < voutput_min ? voutput_min : vout63;
-    vout70 = vout70 < voutput_min ? voutput_min : vout70;
-    vout71 = vout71 < voutput_min ? voutput_min : vout71;
-    vout72 = vout72 < voutput_min ? voutput_min : vout72;
-    vout73 = vout73 < voutput_min ? voutput_min : vout73;
-    vout80 = vout80 < voutput_min ? voutput_min : vout80;
-    vout81 = vout81 < voutput_min ? voutput_min : vout81;
-    vout82 = vout82 < voutput_min ? voutput_min : vout82;
-    vout83 = vout83 < voutput_min ? voutput_min : vout83;
-    vout90 = vout90 < voutput_min ? voutput_min : vout90;
-    vout91 = vout91 < voutput_min ? voutput_min : vout91;
-    vout92 = vout92 < voutput_min ? voutput_min : vout92;
-    vout93 = vout93 < voutput_min ? voutput_min : vout93;
-    vout100 = vout100 < voutput_min ? voutput_min : vout100;
-    vout101 = vout101 < voutput_min ? voutput_min : vout101;
-    vout102 = vout102 < voutput_min ? voutput_min : vout102;
-    vout103 = vout103 < voutput_min ? voutput_min : vout103;
-    vout110 = vout110 < voutput_min ? voutput_min : vout110;
-    vout111 = vout111 < voutput_min ? voutput_min : vout111;
-    vout112 = vout112 < voutput_min ? voutput_min : vout112;
-    vout113 = vout113 < voutput_min ? voutput_min : vout113;
-
-    vout00 = vout00 > voutput_max ? voutput_max : vout00;
-    vout01 = vout01 > voutput_max ? voutput_max : vout01;
-    vout02 = vout02 > voutput_max ? voutput_max : vout02;
-    vout03 = vout03 > voutput_max ? voutput_max : vout03;
-    vout10 = vout10 > voutput_max ? voutput_max : vout10;
-    vout11 = vout11 > voutput_max ? voutput_max : vout11;
-    vout12 = vout12 > voutput_max ? voutput_max : vout12;
-    vout13 = vout13 > voutput_max ? voutput_max : vout13;
-    vout20 = vout20 > voutput_max ? voutput_max : vout20;
-    vout21 = vout21 > voutput_max ? voutput_max : vout21;
-    vout22 = vout22 > voutput_max ? voutput_max : vout22;
-    vout23 = vout23 > voutput_max ? voutput_max : vout23;
-    vout30 = vout30 > voutput_max ? voutput_max : vout30;
-    vout31 = vout31 > voutput_max ? voutput_max : vout31;
-    vout32 = vout32 > voutput_max ? voutput_max : vout32;
-    vout33 = vout33 > voutput_max ? voutput_max : vout33;
-    vout40 = vout40 > voutput_max ? voutput_max : vout40;
-    vout41 = vout41 > voutput_max ? voutput_max : vout41;
-    vout42 = vout42 > voutput_max ? voutput_max : vout42;
-    vout43 = vout43 > voutput_max ? voutput_max : vout43;
-    vout50 = vout50 > voutput_max ? voutput_max : vout50;
-    vout51 = vout51 > voutput_max ? voutput_max : vout51;
-    vout52 = vout52 > voutput_max ? voutput_max : vout52;
-    vout53 = vout53 > voutput_max ? voutput_max : vout53;
-    vout60 = vout60 > voutput_max ? voutput_max : vout60;
-    vout61 = vout61 > voutput_max ? voutput_max : vout61;
-    vout62 = vout62 > voutput_max ? voutput_max : vout62;
-    vout63 = vout63 > voutput_max ? voutput_max : vout63;
-    vout70 = vout70 > voutput_max ? voutput_max : vout70;
-    vout71 = vout71 > voutput_max ? voutput_max : vout71;
-    vout72 = vout72 > voutput_max ? voutput_max : vout72;
-    vout73 = vout73 > voutput_max ? voutput_max : vout73;
-    vout80 = vout80 > voutput_max ? voutput_max : vout80;
-    vout81 = vout81 > voutput_max ? voutput_max : vout81;
-    vout82 = vout82 > voutput_max ? voutput_max : vout82;
-    vout83 = vout83 > voutput_max ? voutput_max : vout83;
-    vout90 = vout90 > voutput_max ? voutput_max : vout90;
-    vout91 = vout91 > voutput_max ? voutput_max : vout91;
-    vout92 = vout92 > voutput_max ? voutput_max : vout92;
-    vout93 = vout93 > voutput_max ? voutput_max : vout93;
-    vout100 = vout100 > voutput_max ? voutput_max : vout100;
-    vout101 = vout101 > voutput_max ? voutput_max : vout101;
-    vout102 = vout102 > voutput_max ? voutput_max : vout102;
-    vout103 = vout103 > voutput_max ? voutput_max : vout103;
-    vout110 = vout110 > voutput_max ? voutput_max : vout110;
-    vout111 = vout111 > voutput_max ? voutput_max : vout111;
-    vout112 = vout112 > voutput_max ? voutput_max : vout112;
-    vout113 = vout113 > voutput_max ? voutput_max : vout113;
-
-    vout00 += voutput_zero_point;
-    vout01 += voutput_zero_point;
-    vout02 += voutput_zero_point;
-    vout03 += voutput_zero_point;
-    vout10 += voutput_zero_point;
-    vout11 += voutput_zero_point;
-    vout12 += voutput_zero_point;
-    vout13 += voutput_zero_point;
-    vout20 += voutput_zero_point;
-    vout21 += voutput_zero_point;
-    vout22 += voutput_zero_point;
-    vout23 += voutput_zero_point;
-    vout30 += voutput_zero_point;
-    vout31 += voutput_zero_point;
-    vout32 += voutput_zero_point;
-    vout33 += voutput_zero_point;
-    vout40 += voutput_zero_point;
-    vout41 += voutput_zero_point;
-    vout42 += voutput_zero_point;
-    vout43 += voutput_zero_point;
-    vout50 += voutput_zero_point;
-    vout51 += voutput_zero_point;
-    vout52 += voutput_zero_point;
-    vout53 += voutput_zero_point;
-    vout60 += voutput_zero_point;
-    vout61 += voutput_zero_point;
-    vout62 += voutput_zero_point;
-    vout63 += voutput_zero_point;
-    vout70 += voutput_zero_point;
-    vout71 += voutput_zero_point;
-    vout72 += voutput_zero_point;
-    vout73 += voutput_zero_point;
-    vout80 += voutput_zero_point;
-    vout81 += voutput_zero_point;
-    vout82 += voutput_zero_point;
-    vout83 += voutput_zero_point;
-    vout90 += voutput_zero_point;
-    vout91 += voutput_zero_point;
-    vout92 += voutput_zero_point;
-    vout93 += voutput_zero_point;
-    vout100 += voutput_zero_point;
-    vout101 += voutput_zero_point;
-    vout102 += voutput_zero_point;
-    vout103 += voutput_zero_point;
-    vout110 += voutput_zero_point;
-    vout111 += voutput_zero_point;
-    vout112 += voutput_zero_point;
-    vout113 += voutput_zero_point;
-
-    if XNN_LIKELY (nc >= 4) {
-      // Main case where the 4 columns fit in the destination.
-      c0[0] = (int8_t) vout00;
-      c0[1] = (int8_t) vout01;
-      c0[2] = (int8_t) vout02;
-      c0[3] = (int8_t) vout03;
-      c1[0] = (int8_t) vout10;
-      c1[1] = (int8_t) vout11;
-      c1[2] = (int8_t) vout12;
-      c1[3] = (int8_t) vout13;
-      c2[0] = (int8_t) vout20;
-      c2[1] = (int8_t) vout21;
-      c2[2] = (int8_t) vout22;
-      c2[3] = (int8_t) vout23;
-      c3[0] = (int8_t) vout30;
-      c3[1] = (int8_t) vout31;
-      c3[2] = (int8_t) vout32;
-      c3[3] = (int8_t) vout33;
-      c4[0] = (int8_t) vout40;
-      c4[1] = (int8_t) vout41;
-      c4[2] = (int8_t) vout42;
-      c4[3] = (int8_t) vout43;
-      c5[0] = (int8_t) vout50;
-      c5[1] = (int8_t) vout51;
-      c5[2] = (int8_t) vout52;
-      c5[3] = (int8_t) vout53;
-      c6[0] = (int8_t) vout60;
-      c6[1] = (int8_t) vout61;
-      c6[2] = (int8_t) vout62;
-      c6[3] = (int8_t) vout63;
-      c7[0] = (int8_t) vout70;
-      c7[1] = (int8_t) vout71;
-      c7[2] = (int8_t) vout72;
-      c7[3] = (int8_t) vout73;
-      c8[0] = (int8_t) vout80;
-      c8[1] = (int8_t) vout81;
-      c8[2] = (int8_t) vout82;
-      c8[3] = (int8_t) vout83;
-      c9[0] = (int8_t) vout90;
-      c9[1] = (int8_t) vout91;
-      c9[2] = (int8_t) vout92;
-      c9[3] = (int8_t) vout93;
-      c10[0] = (int8_t) vout100;
-      c10[1] = (int8_t) vout101;
-      c10[2] = (int8_t) vout102;
-      c10[3] = (int8_t) vout103;
-      c11[0] = (int8_t) vout110;
-      c11[1] = (int8_t) vout111;
-      c11[2] = (int8_t) vout112;
-      c11[3] = (int8_t) vout113;
-
-      // Advance to the next 4 columns.
-      c0 = (int8_t*)((uintptr_t)c0 + cn_stride);
-      c1 = (int8_t*)((uintptr_t)c1 + cn_stride);
-      c2 = (int8_t*)((uintptr_t)c2 + cn_stride);
-      c3 = (int8_t*)((uintptr_t)c3 + cn_stride);
-      c4 = (int8_t*)((uintptr_t)c4 + cn_stride);
-      c5 = (int8_t*)((uintptr_t)c5 + cn_stride);
-      c6 = (int8_t*)((uintptr_t)c6 + cn_stride);
-      c7 = (int8_t*)((uintptr_t)c7 + cn_stride);
-      c8 = (int8_t*)((uintptr_t)c8 + cn_stride);
-      c9 = (int8_t*)((uintptr_t)c9 + cn_stride);
-      c10 = (int8_t*)((uintptr_t)c10 + cn_stride);
-      c11 = (int8_t*)((uintptr_t)c11 + cn_stride);
-
-      nc -= 4;
-    } else {
-      // Final case where not all of the 4 columns fit in the destination.
-      if (nc > 0) {
-        c0[0] = vout00;
-        c1[0] = vout10;
-        c2[0] = vout20;
-        c3[0] = vout30;
-        c4[0] = vout40;
-        c5[0] = vout50;
-        c6[0] = vout60;
-        c7[0] = vout70;
-        c8[0] = vout80;
-        c9[0] = vout90;
-        c10[0] = vout100;
-        c11[0] = vout110;
-      }
-      if (nc > 1) {
-        c0[1] = vout01;
-        c1[1] = vout11;
-        c2[1] = vout21;
-        c3[1] = vout31;
-        c4[1] = vout41;
-        c5[1] = vout51;
-        c6[1] = vout61;
-        c7[1] = vout71;
-        c8[1] = vout81;
-        c9[1] = vout91;
-        c10[1] = vout101;
-        c11[1] = vout111;
-      }
-      if (nc > 2) {
-        c0[2] = vout02;
-        c1[2] = vout12;
-        c2[2] = vout22;
-        c3[2] = vout32;
-        c4[2] = vout42;
-        c5[2] = vout52;
-        c6[2] = vout62;
-        c7[2] = vout72;
-        c8[2] = vout82;
-        c9[2] = vout92;
-        c10[2] = vout102;
-        c11[2] = vout112;
-      }
-      if (nc > 3) {
-        c0[3] = vout03;
-        c1[3] = vout13;
-        c2[3] = vout23;
-        c3[3] = vout33;
-        c4[3] = vout43;
-        c5[3] = vout53;
-        c6[3] = vout63;
-        c7[3] = vout73;
-        c8[3] = vout83;
-        c9[3] = vout93;
-        c10[3] = vout103;
-        c11[3] = vout113;
-      }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
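For readers following the deleted 12x4c4 scalar kernel above, the per-element requantization it applied to every accumulator can be sketched in isolation. The helper below is illustrative only — the names requantize_q31_scalar and the local asr_s32 shim are ours, not XNNPACK API — and mirrors the vq31product/vremainder/vout sequence above: a Q31 fixed-point multiply with rounding, division by 2^shift rounded to nearest via the remainder/threshold test, clamping, and the zero-point addition.

#include <stdint.h>

// Simplified stand-in for the asr_s32 helper the kernel uses: arithmetic right
// shift of a signed 32-bit value (assumes the compiler shifts negative values
// arithmetically, which holds for the toolchains XNNPACK targets).
static inline int32_t asr_s32(int32_t x, uint32_t n) {
  return x >> n;
}

// Illustrative sketch (our naming) of the per-element requantization above.
// As in the kernel, the min/max clamping is applied before the output zero
// point is added, so the bounds are expected to already exclude it.
static inline int8_t requantize_q31_scalar(
    int32_t acc, int32_t multiplier, uint32_t shift,
    int32_t output_min, int32_t output_max, int32_t output_zero_point)
{
  // Q31 fixed-point multiply with rounding: (acc * multiplier + 2^30) >> 31.
  const int64_t product = (int64_t) acc * (int64_t) multiplier;
  const int32_t q31product =
      (int32_t) (uint32_t) ((uint64_t) (product + INT64_C(0x40000000)) >> 31);

  // Round-to-nearest division by 2^shift via the remainder/threshold test.
  const int32_t remainder_mask = (int32_t) ((UINT32_C(1) << shift) - UINT32_C(1));
  const int32_t remainder_threshold = remainder_mask >> 1;
  const int32_t remainder = (q31product & remainder_mask) - (int32_t) (q31product < 0);

  int32_t out = asr_s32(q31product, shift) + (int32_t) (remainder > remainder_threshold);
  out = out < output_min ? output_min : out;
  out = out > output_max ? output_max : out;
  return (int8_t) (out + output_zero_point);
}

In the kernel the multiplier, shift, clamping bounds and zero point are read once from the params struct at the top of the function and reused for all 48 accumulators.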
diff --git a/src/qs8-gemm/gen/12x8c4-minmax-neondot.c b/src/qs8-gemm/gen/12x8c4-minmax-neondot.c
deleted file mode 100644
index a693476..0000000
--- a/src/qs8-gemm/gen/12x8c4-minmax-neondot.c
+++ /dev/null
@@ -1,534 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/qs8-gemm/MRxNRc4-neondot.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include <xnnpack/gemm.h>
-
-
-// This kernel uses ARMv8.2 dot-product instructions.
-//
-// Scalar model: xnn_qs8_gemm_minmax_ukernel_12x8c4__scalar. Refer to
-// that kernel for more comments.
-void xnn_qs8_gemm_minmax_ukernel_12x8c4__neondot(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    const int8_t* restrict a,
-    size_t a_stride,
-    const void* restrict w,
-    int8_t* restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN {
-  assert(mr != 0);
-  assert(mr <= 12);
-  assert(nc != 0);
-  assert(kc != 0);
-
-  const int8_t* a0 = a;
-  int8_t* c0 = c;
-  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
-  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 2) {
-    a1 = a0;
-    c1 = c0;
-  }
-  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
-  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 2) {
-    a2 = a1;
-    c2 = c1;
-  }
-  const int8_t* a3 = (const int8_t*) ((uintptr_t) a2 + a_stride);
-  int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 4) {
-    a3 = a2;
-    c3 = c2;
-  }
-  const int8_t* a4 = (const int8_t*) ((uintptr_t) a3 + a_stride);
-  int8_t* c4 = (int8_t*) ((uintptr_t) c3 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 4) {
-    a4 = a3;
-    c4 = c3;
-  }
-  const int8_t* a5 = (const int8_t*) ((uintptr_t) a4 + a_stride);
-  int8_t* c5 = (int8_t*) ((uintptr_t) c4 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 6) {
-    a5 = a4;
-    c5 = c4;
-  }
-  const int8_t* a6 = (const int8_t*) ((uintptr_t) a5 + a_stride);
-  int8_t* c6 = (int8_t*) ((uintptr_t) c5 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 6) {
-    a6 = a5;
-    c6 = c5;
-  }
-  const int8_t* a7 = (const int8_t*) ((uintptr_t) a6 + a_stride);
-  int8_t* c7 = (int8_t*) ((uintptr_t) c6 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 8) {
-    a7 = a6;
-    c7 = c6;
-  }
-  const int8_t* a8 = (const int8_t*) ((uintptr_t) a7 + a_stride);
-  int8_t* c8 = (int8_t*) ((uintptr_t) c7 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 8) {
-    a8 = a7;
-    c8 = c7;
-  }
-  const int8_t* a9 = (const int8_t*) ((uintptr_t) a8 + a_stride);
-  int8_t* c9 = (int8_t*) ((uintptr_t) c8 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 10) {
-    a9 = a8;
-    c9 = c8;
-  }
-  const int8_t* a10 = (const int8_t*) ((uintptr_t) a9 + a_stride);
-  int8_t* c10 = (int8_t*) ((uintptr_t) c9 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 10) {
-    a10 = a9;
-    c10 = c9;
-  }
-  const int8_t* a11 = (const int8_t*) ((uintptr_t) a10 + a_stride);
-  int8_t* c11 = (int8_t*) ((uintptr_t) c10 + cm_stride);
-  if XNN_UNPREDICTABLE(mr != 12) {
-    a11 = a10;
-    c11 = c10;
-  }
-
-  // Loop over groups of 8 columns.
-  do {
-    // Initialize accumulators with bias. 8 bias values are loaded from the
-    // weight matrix, at the start of the group of 8 columns.
-    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
-    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
-    int32x4_t vacc1x0123 = vacc0x0123;
-    int32x4_t vacc1x4567 = vacc0x4567;
-    int32x4_t vacc2x0123 = vacc0x0123;
-    int32x4_t vacc2x4567 = vacc0x4567;
-    int32x4_t vacc3x0123 = vacc0x0123;
-    int32x4_t vacc3x4567 = vacc0x4567;
-    int32x4_t vacc4x0123 = vacc0x0123;
-    int32x4_t vacc4x4567 = vacc0x4567;
-    int32x4_t vacc5x0123 = vacc0x0123;
-    int32x4_t vacc5x4567 = vacc0x4567;
-    int32x4_t vacc6x0123 = vacc0x0123;
-    int32x4_t vacc6x4567 = vacc0x4567;
-    int32x4_t vacc7x0123 = vacc0x0123;
-    int32x4_t vacc7x4567 = vacc0x4567;
-    int32x4_t vacc8x0123 = vacc0x0123;
-    int32x4_t vacc8x4567 = vacc0x4567;
-    int32x4_t vacc9x0123 = vacc0x0123;
-    int32x4_t vacc9x4567 = vacc0x4567;
-    int32x4_t vacc10x0123 = vacc0x0123;
-    int32x4_t vacc10x4567 = vacc0x4567;
-    int32x4_t vacc11x0123 = vacc0x0123;
-    int32x4_t vacc11x4567 = vacc0x4567;
-
-    // Inner accumulation loop along the reduction dimension kc for this group of 8 columns.
-    size_t k = kc;
-    // Partially unrolled (2x) loop that loads 8 bytes at a time.
-    while (k >= 8 * sizeof(int8_t)) {
-      // Load a 12x8 block of activations.
-      const int8x8_t va0x01234567 = vld1_s8(a0); a0 += 8;
-      const int8x8_t va1x01234567 = vld1_s8(a1); a1 += 8;
-      const int8x8_t va2x01234567 = vld1_s8(a2); a2 += 8;
-      const int8x8_t va3x01234567 = vld1_s8(a3); a3 += 8;
-      const int8x8_t va4x01234567 = vld1_s8(a4); a4 += 8;
-      const int8x8_t va5x01234567 = vld1_s8(a5); a5 += 8;
-      const int8x8_t va6x01234567 = vld1_s8(a6); a6 += 8;
-      const int8x8_t va7x01234567 = vld1_s8(a7); a7 += 8;
-      const int8x8_t va8x01234567 = vld1_s8(a8); a8 += 8;
-      const int8x8_t va9x01234567 = vld1_s8(a9); a9 += 8;
-      const int8x8_t va10x01234567 = vld1_s8(a10); a10 += 8;
-      const int8x8_t va11x01234567 = vld1_s8(a11); a11 += 8;
-
-      // Load an 8x8 block of weights.
-      const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-
-      // Multiply-accumulate: 12x8 * 8x8 --> 12x8.
-      vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb0123x0123, va0x01234567, 0);
-      vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb0123x4567, va0x01234567, 0);
-      vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb0123x0123, va1x01234567, 0);
-      vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb0123x4567, va1x01234567, 0);
-      vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb0123x0123, va2x01234567, 0);
-      vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb0123x4567, va2x01234567, 0);
-      vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb0123x0123, va3x01234567, 0);
-      vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb0123x4567, va3x01234567, 0);
-      vacc4x0123 = vdotq_lane_s32(vacc4x0123, vb0123x0123, va4x01234567, 0);
-      vacc4x4567 = vdotq_lane_s32(vacc4x4567, vb0123x4567, va4x01234567, 0);
-      vacc5x0123 = vdotq_lane_s32(vacc5x0123, vb0123x0123, va5x01234567, 0);
-      vacc5x4567 = vdotq_lane_s32(vacc5x4567, vb0123x4567, va5x01234567, 0);
-      vacc6x0123 = vdotq_lane_s32(vacc6x0123, vb0123x0123, va6x01234567, 0);
-      vacc6x4567 = vdotq_lane_s32(vacc6x4567, vb0123x4567, va6x01234567, 0);
-      vacc7x0123 = vdotq_lane_s32(vacc7x0123, vb0123x0123, va7x01234567, 0);
-      vacc7x4567 = vdotq_lane_s32(vacc7x4567, vb0123x4567, va7x01234567, 0);
-      vacc8x0123 = vdotq_lane_s32(vacc8x0123, vb0123x0123, va8x01234567, 0);
-      vacc8x4567 = vdotq_lane_s32(vacc8x4567, vb0123x4567, va8x01234567, 0);
-      vacc9x0123 = vdotq_lane_s32(vacc9x0123, vb0123x0123, va9x01234567, 0);
-      vacc9x4567 = vdotq_lane_s32(vacc9x4567, vb0123x4567, va9x01234567, 0);
-      vacc10x0123 = vdotq_lane_s32(vacc10x0123, vb0123x0123, va10x01234567, 0);
-      vacc10x4567 = vdotq_lane_s32(vacc10x4567, vb0123x4567, va10x01234567, 0);
-      vacc11x0123 = vdotq_lane_s32(vacc11x0123, vb0123x0123, va11x01234567, 0);
-      vacc11x4567 = vdotq_lane_s32(vacc11x4567, vb0123x4567, va11x01234567, 0);
-      vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
-      vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
-      vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb4567x0123, va1x01234567, 1);
-      vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb4567x4567, va1x01234567, 1);
-      vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb4567x0123, va2x01234567, 1);
-      vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb4567x4567, va2x01234567, 1);
-      vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb4567x0123, va3x01234567, 1);
-      vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb4567x4567, va3x01234567, 1);
-      vacc4x0123 = vdotq_lane_s32(vacc4x0123, vb4567x0123, va4x01234567, 1);
-      vacc4x4567 = vdotq_lane_s32(vacc4x4567, vb4567x4567, va4x01234567, 1);
-      vacc5x0123 = vdotq_lane_s32(vacc5x0123, vb4567x0123, va5x01234567, 1);
-      vacc5x4567 = vdotq_lane_s32(vacc5x4567, vb4567x4567, va5x01234567, 1);
-      vacc6x0123 = vdotq_lane_s32(vacc6x0123, vb4567x0123, va6x01234567, 1);
-      vacc6x4567 = vdotq_lane_s32(vacc6x4567, vb4567x4567, va6x01234567, 1);
-      vacc7x0123 = vdotq_lane_s32(vacc7x0123, vb4567x0123, va7x01234567, 1);
-      vacc7x4567 = vdotq_lane_s32(vacc7x4567, vb4567x4567, va7x01234567, 1);
-      vacc8x0123 = vdotq_lane_s32(vacc8x0123, vb4567x0123, va8x01234567, 1);
-      vacc8x4567 = vdotq_lane_s32(vacc8x4567, vb4567x4567, va8x01234567, 1);
-      vacc9x0123 = vdotq_lane_s32(vacc9x0123, vb4567x0123, va9x01234567, 1);
-      vacc9x4567 = vdotq_lane_s32(vacc9x4567, vb4567x4567, va9x01234567, 1);
-      vacc10x0123 = vdotq_lane_s32(vacc10x0123, vb4567x0123, va10x01234567, 1);
-      vacc10x4567 = vdotq_lane_s32(vacc10x4567, vb4567x4567, va10x01234567, 1);
-      vacc11x0123 = vdotq_lane_s32(vacc11x0123, vb4567x0123, va11x01234567, 1);
-      vacc11x4567 = vdotq_lane_s32(vacc11x4567, vb4567x4567, va11x01234567, 1);
-
-      k -= 8 * sizeof(int8_t);
-    }
-    // Handle up to 7 final positions of `k`.
-    if XNN_UNLIKELY(k != 0) {
-      // Load a 12x4 block of activations.
-      const int8x8_t va0x01234567 = vld1_s8(a0); a0 += k;
-      const int8x8_t va1x01234567 = vld1_s8(a1); a1 += k;
-      const int8x8_t va2x01234567 = vld1_s8(a2); a2 += k;
-      const int8x8_t va3x01234567 = vld1_s8(a3); a3 += k;
-      const int8x8_t va4x01234567 = vld1_s8(a4); a4 += k;
-      const int8x8_t va5x01234567 = vld1_s8(a5); a5 += k;
-      const int8x8_t va6x01234567 = vld1_s8(a6); a6 += k;
-      const int8x8_t va7x01234567 = vld1_s8(a7); a7 += k;
-      const int8x8_t va8x01234567 = vld1_s8(a8); a8 += k;
-      const int8x8_t va9x01234567 = vld1_s8(a9); a9 += k;
-      const int8x8_t va10x01234567 = vld1_s8(a10); a10 += k;
-      const int8x8_t va11x01234567 = vld1_s8(a11); a11 += k;
-
-      // Load a 4x8 block of weights.
-      const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-
-      // Multiply-accumulate: 12x4 * 4x8 --> 12x8.
-      vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb0123x0123, va0x01234567, 0);
-      vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb0123x4567, va0x01234567, 0);
-      vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb0123x0123, va1x01234567, 0);
-      vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb0123x4567, va1x01234567, 0);
-      vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb0123x0123, va2x01234567, 0);
-      vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb0123x4567, va2x01234567, 0);
-      vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb0123x0123, va3x01234567, 0);
-      vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb0123x4567, va3x01234567, 0);
-      vacc4x0123 = vdotq_lane_s32(vacc4x0123, vb0123x0123, va4x01234567, 0);
-      vacc4x4567 = vdotq_lane_s32(vacc4x4567, vb0123x4567, va4x01234567, 0);
-      vacc5x0123 = vdotq_lane_s32(vacc5x0123, vb0123x0123, va5x01234567, 0);
-      vacc5x4567 = vdotq_lane_s32(vacc5x4567, vb0123x4567, va5x01234567, 0);
-      vacc6x0123 = vdotq_lane_s32(vacc6x0123, vb0123x0123, va6x01234567, 0);
-      vacc6x4567 = vdotq_lane_s32(vacc6x4567, vb0123x4567, va6x01234567, 0);
-      vacc7x0123 = vdotq_lane_s32(vacc7x0123, vb0123x0123, va7x01234567, 0);
-      vacc7x4567 = vdotq_lane_s32(vacc7x4567, vb0123x4567, va7x01234567, 0);
-      vacc8x0123 = vdotq_lane_s32(vacc8x0123, vb0123x0123, va8x01234567, 0);
-      vacc8x4567 = vdotq_lane_s32(vacc8x4567, vb0123x4567, va8x01234567, 0);
-      vacc9x0123 = vdotq_lane_s32(vacc9x0123, vb0123x0123, va9x01234567, 0);
-      vacc9x4567 = vdotq_lane_s32(vacc9x4567, vb0123x4567, va9x01234567, 0);
-      vacc10x0123 = vdotq_lane_s32(vacc10x0123, vb0123x0123, va10x01234567, 0);
-      vacc10x4567 = vdotq_lane_s32(vacc10x4567, vb0123x4567, va10x01234567, 0);
-      vacc11x0123 = vdotq_lane_s32(vacc11x0123, vb0123x0123, va11x01234567, 0);
-      vacc11x4567 = vdotq_lane_s32(vacc11x4567, vb0123x4567, va11x01234567, 0);
-
-      if (k > 4) {
-        // Load a 4x8 block of weights.
-        const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-
-        // Multiply-accumulate: 12x4 * 4x8 --> 12x8.
-        vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
-        vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
-        vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb4567x0123, va1x01234567, 1);
-        vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb4567x4567, va1x01234567, 1);
-        vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb4567x0123, va2x01234567, 1);
-        vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb4567x4567, va2x01234567, 1);
-        vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb4567x0123, va3x01234567, 1);
-        vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb4567x4567, va3x01234567, 1);
-        vacc4x0123 = vdotq_lane_s32(vacc4x0123, vb4567x0123, va4x01234567, 1);
-        vacc4x4567 = vdotq_lane_s32(vacc4x4567, vb4567x4567, va4x01234567, 1);
-        vacc5x0123 = vdotq_lane_s32(vacc5x0123, vb4567x0123, va5x01234567, 1);
-        vacc5x4567 = vdotq_lane_s32(vacc5x4567, vb4567x4567, va5x01234567, 1);
-        vacc6x0123 = vdotq_lane_s32(vacc6x0123, vb4567x0123, va6x01234567, 1);
-        vacc6x4567 = vdotq_lane_s32(vacc6x4567, vb4567x4567, va6x01234567, 1);
-        vacc7x0123 = vdotq_lane_s32(vacc7x0123, vb4567x0123, va7x01234567, 1);
-        vacc7x4567 = vdotq_lane_s32(vacc7x4567, vb4567x4567, va7x01234567, 1);
-        vacc8x0123 = vdotq_lane_s32(vacc8x0123, vb4567x0123, va8x01234567, 1);
-        vacc8x4567 = vdotq_lane_s32(vacc8x4567, vb4567x4567, va8x01234567, 1);
-        vacc9x0123 = vdotq_lane_s32(vacc9x0123, vb4567x0123, va9x01234567, 1);
-        vacc9x4567 = vdotq_lane_s32(vacc9x4567, vb4567x4567, va9x01234567, 1);
-        vacc10x0123 = vdotq_lane_s32(vacc10x0123, vb4567x0123, va10x01234567, 1);
-        vacc10x4567 = vdotq_lane_s32(vacc10x4567, vb4567x4567, va10x01234567, 1);
-        vacc11x0123 = vdotq_lane_s32(vacc11x0123, vb4567x0123, va11x01234567, 1);
-        vacc11x4567 = vdotq_lane_s32(vacc11x4567, vb4567x4567, va11x01234567, 1);
-      }
-    }
-    // End of accumulation loop. The variable `kc` contains the amount by which
-    // we advanced the `va` pointers, so we rewind by this amount now.
-    a0 = (const int8_t*)((uintptr_t)a0 - kc);
-    a1 = (const int8_t*)((uintptr_t)a1 - kc);
-    a2 = (const int8_t*)((uintptr_t)a2 - kc);
-    a3 = (const int8_t*)((uintptr_t)a3 - kc);
-    a4 = (const int8_t*)((uintptr_t)a4 - kc);
-    a5 = (const int8_t*)((uintptr_t)a5 - kc);
-    a6 = (const int8_t*)((uintptr_t)a6 - kc);
-    a7 = (const int8_t*)((uintptr_t)a7 - kc);
-    a8 = (const int8_t*)((uintptr_t)a8 - kc);
-    a9 = (const int8_t*)((uintptr_t)a9 - kc);
-    a10 = (const int8_t*)((uintptr_t)a10 - kc);
-    a11 = (const int8_t*)((uintptr_t)a11 - kc);
-
-    // Post-accumulation work
-
-    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
-    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
-
-    const int32x4_t vproduct0x0123 = vqrdmulhq_n_s32(vacc0x0123, params->neon.multiplier);
-    const int32x4_t vproduct0x4567 = vqrdmulhq_n_s32(vacc0x4567, params->neon.multiplier);
-    const int32x4_t vproduct1x0123 = vqrdmulhq_n_s32(vacc1x0123, params->neon.multiplier);
-    const int32x4_t vproduct1x4567 = vqrdmulhq_n_s32(vacc1x4567, params->neon.multiplier);
-    const int32x4_t vproduct2x0123 = vqrdmulhq_n_s32(vacc2x0123, params->neon.multiplier);
-    const int32x4_t vproduct2x4567 = vqrdmulhq_n_s32(vacc2x4567, params->neon.multiplier);
-    const int32x4_t vproduct3x0123 = vqrdmulhq_n_s32(vacc3x0123, params->neon.multiplier);
-    const int32x4_t vproduct3x4567 = vqrdmulhq_n_s32(vacc3x4567, params->neon.multiplier);
-    const int32x4_t vproduct4x0123 = vqrdmulhq_n_s32(vacc4x0123, params->neon.multiplier);
-    const int32x4_t vproduct4x4567 = vqrdmulhq_n_s32(vacc4x4567, params->neon.multiplier);
-    const int32x4_t vproduct5x0123 = vqrdmulhq_n_s32(vacc5x0123, params->neon.multiplier);
-    const int32x4_t vproduct5x4567 = vqrdmulhq_n_s32(vacc5x4567, params->neon.multiplier);
-    const int32x4_t vproduct6x0123 = vqrdmulhq_n_s32(vacc6x0123, params->neon.multiplier);
-    const int32x4_t vproduct6x4567 = vqrdmulhq_n_s32(vacc6x4567, params->neon.multiplier);
-    const int32x4_t vproduct7x0123 = vqrdmulhq_n_s32(vacc7x0123, params->neon.multiplier);
-    const int32x4_t vproduct7x4567 = vqrdmulhq_n_s32(vacc7x4567, params->neon.multiplier);
-    const int32x4_t vproduct8x0123 = vqrdmulhq_n_s32(vacc8x0123, params->neon.multiplier);
-    const int32x4_t vproduct8x4567 = vqrdmulhq_n_s32(vacc8x4567, params->neon.multiplier);
-    const int32x4_t vproduct9x0123 = vqrdmulhq_n_s32(vacc9x0123, params->neon.multiplier);
-    const int32x4_t vproduct9x4567 = vqrdmulhq_n_s32(vacc9x4567, params->neon.multiplier);
-    const int32x4_t vproduct10x0123 = vqrdmulhq_n_s32(vacc10x0123, params->neon.multiplier);
-    const int32x4_t vproduct10x4567 = vqrdmulhq_n_s32(vacc10x4567, params->neon.multiplier);
-    const int32x4_t vproduct11x0123 = vqrdmulhq_n_s32(vacc11x0123, params->neon.multiplier);
-    const int32x4_t vproduct11x4567 = vqrdmulhq_n_s32(vacc11x4567, params->neon.multiplier);
-
-    vacc0x0123 = vsraq_n_s32(vproduct0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
-    vacc0x4567 = vsraq_n_s32(vproduct0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
-    vacc1x0123 = vsraq_n_s32(vproduct1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
-    vacc1x4567 = vsraq_n_s32(vproduct1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
-    vacc2x0123 = vsraq_n_s32(vproduct2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
-    vacc2x4567 = vsraq_n_s32(vproduct2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
-    vacc3x0123 = vsraq_n_s32(vproduct3x0123, vbicq_s32(vacc3x0123, vzero_shift_mask), 31);
-    vacc3x4567 = vsraq_n_s32(vproduct3x4567, vbicq_s32(vacc3x4567, vzero_shift_mask), 31);
-    vacc4x0123 = vsraq_n_s32(vproduct4x0123, vbicq_s32(vacc4x0123, vzero_shift_mask), 31);
-    vacc4x4567 = vsraq_n_s32(vproduct4x4567, vbicq_s32(vacc4x4567, vzero_shift_mask), 31);
-    vacc5x0123 = vsraq_n_s32(vproduct5x0123, vbicq_s32(vacc5x0123, vzero_shift_mask), 31);
-    vacc5x4567 = vsraq_n_s32(vproduct5x4567, vbicq_s32(vacc5x4567, vzero_shift_mask), 31);
-    vacc6x0123 = vsraq_n_s32(vproduct6x0123, vbicq_s32(vacc6x0123, vzero_shift_mask), 31);
-    vacc6x4567 = vsraq_n_s32(vproduct6x4567, vbicq_s32(vacc6x4567, vzero_shift_mask), 31);
-    vacc7x0123 = vsraq_n_s32(vproduct7x0123, vbicq_s32(vacc7x0123, vzero_shift_mask), 31);
-    vacc7x4567 = vsraq_n_s32(vproduct7x4567, vbicq_s32(vacc7x4567, vzero_shift_mask), 31);
-    vacc8x0123 = vsraq_n_s32(vproduct8x0123, vbicq_s32(vacc8x0123, vzero_shift_mask), 31);
-    vacc8x4567 = vsraq_n_s32(vproduct8x4567, vbicq_s32(vacc8x4567, vzero_shift_mask), 31);
-    vacc9x0123 = vsraq_n_s32(vproduct9x0123, vbicq_s32(vacc9x0123, vzero_shift_mask), 31);
-    vacc9x4567 = vsraq_n_s32(vproduct9x4567, vbicq_s32(vacc9x4567, vzero_shift_mask), 31);
-    vacc10x0123 = vsraq_n_s32(vproduct10x0123, vbicq_s32(vacc10x0123, vzero_shift_mask), 31);
-    vacc10x4567 = vsraq_n_s32(vproduct10x4567, vbicq_s32(vacc10x4567, vzero_shift_mask), 31);
-    vacc11x0123 = vsraq_n_s32(vproduct11x0123, vbicq_s32(vacc11x0123, vzero_shift_mask), 31);
-    vacc11x4567 = vsraq_n_s32(vproduct11x4567, vbicq_s32(vacc11x4567, vzero_shift_mask), 31);
-
-    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
-    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
-    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
-    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
-    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
-    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
-    vacc3x0123 = vrshlq_s32(vacc3x0123, vright_shift);
-    vacc3x4567 = vrshlq_s32(vacc3x4567, vright_shift);
-    vacc4x0123 = vrshlq_s32(vacc4x0123, vright_shift);
-    vacc4x4567 = vrshlq_s32(vacc4x4567, vright_shift);
-    vacc5x0123 = vrshlq_s32(vacc5x0123, vright_shift);
-    vacc5x4567 = vrshlq_s32(vacc5x4567, vright_shift);
-    vacc6x0123 = vrshlq_s32(vacc6x0123, vright_shift);
-    vacc6x4567 = vrshlq_s32(vacc6x4567, vright_shift);
-    vacc7x0123 = vrshlq_s32(vacc7x0123, vright_shift);
-    vacc7x4567 = vrshlq_s32(vacc7x4567, vright_shift);
-    vacc8x0123 = vrshlq_s32(vacc8x0123, vright_shift);
-    vacc8x4567 = vrshlq_s32(vacc8x4567, vright_shift);
-    vacc9x0123 = vrshlq_s32(vacc9x0123, vright_shift);
-    vacc9x4567 = vrshlq_s32(vacc9x4567, vright_shift);
-    vacc10x0123 = vrshlq_s32(vacc10x0123, vright_shift);
-    vacc10x4567 = vrshlq_s32(vacc10x4567, vright_shift);
-    vacc11x0123 = vrshlq_s32(vacc11x0123, vright_shift);
-    vacc11x4567 = vrshlq_s32(vacc11x4567, vright_shift);
-
-    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
-#if XNN_ARCH_ARM64
-    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
-    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
-    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
-    const int16x8_t vacc3x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567), voutput_zero_point);
-    const int16x8_t vacc4x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc4x0123), vacc4x4567), voutput_zero_point);
-    const int16x8_t vacc5x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc5x0123), vacc5x4567), voutput_zero_point);
-    const int16x8_t vacc6x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc6x0123), vacc6x4567), voutput_zero_point);
-    const int16x8_t vacc7x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc7x0123), vacc7x4567), voutput_zero_point);
-    const int16x8_t vacc8x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc8x0123), vacc8x4567), voutput_zero_point);
-    const int16x8_t vacc9x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc9x0123), vacc9x4567), voutput_zero_point);
-    const int16x8_t vacc10x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc10x0123), vacc10x4567), voutput_zero_point);
-    const int16x8_t vacc11x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc11x0123), vacc11x4567), voutput_zero_point);
-
-    int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
-    int8x16_t vout2x01234567_3x01234567 = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc3x01234567);
-    int8x16_t vout4x01234567_5x01234567 = vqmovn_high_s16(vqmovn_s16(vacc4x01234567), vacc5x01234567);
-    int8x16_t vout6x01234567_7x01234567 = vqmovn_high_s16(vqmovn_s16(vacc6x01234567), vacc7x01234567);
-    int8x16_t vout8x01234567_9x01234567 = vqmovn_high_s16(vqmovn_s16(vacc8x01234567), vacc9x01234567);
-    int8x16_t vout10x01234567_11x01234567 = vqmovn_high_s16(vqmovn_s16(vacc10x01234567), vacc11x01234567);
-#else
-    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
-    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
-    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
-    const int16x8_t vacc3x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x0123), vqmovn_s32(vacc3x4567)), voutput_zero_point);
-    const int16x8_t vacc4x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc4x0123), vqmovn_s32(vacc4x4567)), voutput_zero_point);
-    const int16x8_t vacc5x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc5x0123), vqmovn_s32(vacc5x4567)), voutput_zero_point);
-    const int16x8_t vacc6x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc6x0123), vqmovn_s32(vacc6x4567)), voutput_zero_point);
-    const int16x8_t vacc7x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc7x0123), vqmovn_s32(vacc7x4567)), voutput_zero_point);
-    const int16x8_t vacc8x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc8x0123), vqmovn_s32(vacc8x4567)), voutput_zero_point);
-    const int16x8_t vacc9x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc9x0123), vqmovn_s32(vacc9x4567)), voutput_zero_point);
-    const int16x8_t vacc10x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc10x0123), vqmovn_s32(vacc10x4567)), voutput_zero_point);
-    const int16x8_t vacc11x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc11x0123), vqmovn_s32(vacc11x4567)), voutput_zero_point);
-
-    int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
-    int8x16_t vout2x01234567_3x01234567 = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc3x01234567));
-    int8x16_t vout4x01234567_5x01234567 = vcombine_s8(vqmovn_s16(vacc4x01234567), vqmovn_s16(vacc5x01234567));
-    int8x16_t vout6x01234567_7x01234567 = vcombine_s8(vqmovn_s16(vacc6x01234567), vqmovn_s16(vacc7x01234567));
-    int8x16_t vout8x01234567_9x01234567 = vcombine_s8(vqmovn_s16(vacc8x01234567), vqmovn_s16(vacc9x01234567));
-    int8x16_t vout10x01234567_11x01234567 = vcombine_s8(vqmovn_s16(vacc10x01234567), vqmovn_s16(vacc11x01234567));
-#endif
-    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
-    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
-
-    vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min);
-    vout2x01234567_3x01234567 = vmaxq_s8(vout2x01234567_3x01234567, voutput_min);
-    vout4x01234567_5x01234567 = vmaxq_s8(vout4x01234567_5x01234567, voutput_min);
-    vout6x01234567_7x01234567 = vmaxq_s8(vout6x01234567_7x01234567, voutput_min);
-    vout8x01234567_9x01234567 = vmaxq_s8(vout8x01234567_9x01234567, voutput_min);
-    vout10x01234567_11x01234567 = vmaxq_s8(vout10x01234567_11x01234567, voutput_min);
-
-    vout0x01234567_1x01234567 = vminq_s8(vout0x01234567_1x01234567, voutput_max);
-    vout2x01234567_3x01234567 = vminq_s8(vout2x01234567_3x01234567, voutput_max);
-    vout4x01234567_5x01234567 = vminq_s8(vout4x01234567_5x01234567, voutput_max);
-    vout6x01234567_7x01234567 = vminq_s8(vout6x01234567_7x01234567, voutput_max);
-    vout8x01234567_9x01234567 = vminq_s8(vout8x01234567_9x01234567, voutput_max);
-    vout10x01234567_11x01234567 = vminq_s8(vout10x01234567_11x01234567, voutput_max);
-
-    if (nc >= 8) {
-      // Main case where the 8 columns fit in the destination.
-      vst1_s8(c0 + 0, vget_low_s8(vout0x01234567_1x01234567));
-      vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567));
-      vst1_s8(c2 + 0, vget_low_s8(vout2x01234567_3x01234567));
-      vst1_s8(c3 + 0, vget_high_s8(vout2x01234567_3x01234567));
-      vst1_s8(c4 + 0, vget_low_s8(vout4x01234567_5x01234567));
-      vst1_s8(c5 + 0, vget_high_s8(vout4x01234567_5x01234567));
-      vst1_s8(c6 + 0, vget_low_s8(vout6x01234567_7x01234567));
-      vst1_s8(c7 + 0, vget_high_s8(vout6x01234567_7x01234567));
-      vst1_s8(c8 + 0, vget_low_s8(vout8x01234567_9x01234567));
-      vst1_s8(c9 + 0, vget_high_s8(vout8x01234567_9x01234567));
-      vst1_s8(c10 + 0, vget_low_s8(vout10x01234567_11x01234567));
-      vst1_s8(c11 + 0, vget_high_s8(vout10x01234567_11x01234567));
-
-      // Advance to the next 8 columns.
-      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
-      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
-      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
-      c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
-      c4 = (int8_t*) ((uintptr_t) c4 + cn_stride);
-      c5 = (int8_t*) ((uintptr_t) c5 + cn_stride);
-      c6 = (int8_t*) ((uintptr_t) c6 + cn_stride);
-      c7 = (int8_t*) ((uintptr_t) c7 + cn_stride);
-      c8 = (int8_t*) ((uintptr_t) c8 + cn_stride);
-      c9 = (int8_t*) ((uintptr_t) c9 + cn_stride);
-      c10 = (int8_t*) ((uintptr_t) c10 + cn_stride);
-      c11 = (int8_t*) ((uintptr_t) c11 + cn_stride);
-
-      nc -= 8;
-    } else {
-      // Final case where not all of the 8 columns fit in the destination.
-      if (nc & 4) {
-        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
-        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
-        vst1q_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 0); c2 += 4;
-        vst1q_lane_u32(__builtin_assume_aligned(c3, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 2); c3 += 4;
-        vst1q_lane_u32(__builtin_assume_aligned(c4, 1), vreinterpretq_u32_s8(vout4x01234567_5x01234567), 0); c4 += 4;
-        vst1q_lane_u32(__builtin_assume_aligned(c5, 1), vreinterpretq_u32_s8(vout4x01234567_5x01234567), 2); c5 += 4;
-        vst1q_lane_u32(__builtin_assume_aligned(c6, 1), vreinterpretq_u32_s8(vout6x01234567_7x01234567), 0); c6 += 4;
-        vst1q_lane_u32(__builtin_assume_aligned(c7, 1), vreinterpretq_u32_s8(vout6x01234567_7x01234567), 2); c7 += 4;
-        vst1q_lane_u32(__builtin_assume_aligned(c8, 1), vreinterpretq_u32_s8(vout8x01234567_9x01234567), 0); c8 += 4;
-        vst1q_lane_u32(__builtin_assume_aligned(c9, 1), vreinterpretq_u32_s8(vout8x01234567_9x01234567), 2); c9 += 4;
-        vst1q_lane_u32(__builtin_assume_aligned(c10, 1), vreinterpretq_u32_s8(vout10x01234567_11x01234567), 0); c10 += 4;
-        vst1q_lane_u32(__builtin_assume_aligned(c11, 1), vreinterpretq_u32_s8(vout10x01234567_11x01234567), 2); c11 += 4;
-        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
-        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4);
-        vout4x01234567_5x01234567 = vextq_s8(vout4x01234567_5x01234567, vout4x01234567_5x01234567, 4);
-        vout6x01234567_7x01234567 = vextq_s8(vout6x01234567_7x01234567, vout6x01234567_7x01234567, 4);
-        vout8x01234567_9x01234567 = vextq_s8(vout8x01234567_9x01234567, vout8x01234567_9x01234567, 4);
-        vout10x01234567_11x01234567 = vextq_s8(vout10x01234567_11x01234567, vout10x01234567_11x01234567, 4);
-      }
-      if (nc & 2) {
-        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
-        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
-        vst1q_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 0); c2 += 2;
-        vst1q_lane_u16(__builtin_assume_aligned(c3, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 4); c3 += 2;
-        vst1q_lane_u16(__builtin_assume_aligned(c4, 1), vreinterpretq_u16_s8(vout4x01234567_5x01234567), 0); c4 += 2;
-        vst1q_lane_u16(__builtin_assume_aligned(c5, 1), vreinterpretq_u16_s8(vout4x01234567_5x01234567), 4); c5 += 2;
-        vst1q_lane_u16(__builtin_assume_aligned(c6, 1), vreinterpretq_u16_s8(vout6x01234567_7x01234567), 0); c6 += 2;
-        vst1q_lane_u16(__builtin_assume_aligned(c7, 1), vreinterpretq_u16_s8(vout6x01234567_7x01234567), 4); c7 += 2;
-        vst1q_lane_u16(__builtin_assume_aligned(c8, 1), vreinterpretq_u16_s8(vout8x01234567_9x01234567), 0); c8 += 2;
-        vst1q_lane_u16(__builtin_assume_aligned(c9, 1), vreinterpretq_u16_s8(vout8x01234567_9x01234567), 4); c9 += 2;
-        vst1q_lane_u16(__builtin_assume_aligned(c10, 1), vreinterpretq_u16_s8(vout10x01234567_11x01234567), 0); c10 += 2;
-        vst1q_lane_u16(__builtin_assume_aligned(c11, 1), vreinterpretq_u16_s8(vout10x01234567_11x01234567), 4); c11 += 2;
-        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
-        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2);
-        vout4x01234567_5x01234567 = vextq_s8(vout4x01234567_5x01234567, vout4x01234567_5x01234567, 2);
-        vout6x01234567_7x01234567 = vextq_s8(vout6x01234567_7x01234567, vout6x01234567_7x01234567, 2);
-        vout8x01234567_9x01234567 = vextq_s8(vout8x01234567_9x01234567, vout8x01234567_9x01234567, 2);
-        vout10x01234567_11x01234567 = vextq_s8(vout10x01234567_11x01234567, vout10x01234567_11x01234567, 2);
-      }
-      if (nc & 1) {
-        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
-        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
-        vst1q_lane_s8(c2, vout2x01234567_3x01234567, 0);
-        vst1q_lane_s8(c3, vout2x01234567_3x01234567, 8);
-        vst1q_lane_s8(c4, vout4x01234567_5x01234567, 0);
-        vst1q_lane_s8(c5, vout4x01234567_5x01234567, 8);
-        vst1q_lane_s8(c6, vout6x01234567_7x01234567, 0);
-        vst1q_lane_s8(c7, vout6x01234567_7x01234567, 8);
-        vst1q_lane_s8(c8, vout8x01234567_9x01234567, 0);
-        vst1q_lane_s8(c9, vout8x01234567_9x01234567, 8);
-        vst1q_lane_s8(c10, vout10x01234567_11x01234567, 0);
-        vst1q_lane_s8(c11, vout10x01234567_11x01234567, 8);
-      }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
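The deleted 12x8c4 NEON dot-product kernel above requantizes its 24 int32x4_t accumulators with the vqrdmulh/vrshl sequence in its post-accumulation block. A minimal sketch of that step for a single vector, under our own naming (requantize_q31_neon is not an XNNPACK function), looks like this; right_shift is the non-positive shift count the kernel broadcasts with vld1q_dup_s32, so vrshlq_s32 with it performs a rounding right shift.

#include <arm_neon.h>
#include <stdint.h>

// Minimal sketch of the NEON requantization applied to one int32x4_t
// accumulator in the kernel above.
static inline int32x4_t requantize_q31_neon(
    int32x4_t acc, int32_t multiplier, int32x4_t right_shift)
{
  // Doubling high multiply with rounding: saturate((2*acc*multiplier + 2^31) >> 32).
  const int32x4_t product = vqrdmulhq_n_s32(acc, multiplier);
  // Subtract 1 from lanes where the accumulator is negative and the shift is
  // non-zero, compensating the rounding of vqrdmulh before the final shift.
  const int32x4_t zero_shift_mask =
      vreinterpretq_s32_u32(vceqq_s32(right_shift, vmovq_n_s32(0)));
  const int32x4_t adjusted =
      vsraq_n_s32(product, vbicq_s32(acc, zero_shift_mask), 31);
  // Rounding arithmetic right shift by -right_shift.
  return vrshlq_s32(adjusted, right_shift);
}

After this step the kernel narrows each pair of vectors to int16 with a saturating zero-point add, then to int8, and clamps with output_min/output_max, exactly as in the code above.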
diff --git a/src/qs8-gemm/gen/1x16-minmax-neon-mull-addw-dup.c b/src/qs8-gemm/gen/1x16-minmax-neon-mull-addw-dup.c
new file mode 100644
index 0000000..0d1e159
--- /dev/null
+++ b/src/qs8-gemm/gen/1x16-minmax-neon-mull-addw-dup.c
@@ -0,0 +1,286 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/neon-mull-addw-dup.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/gemm.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+
+  do {
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
+      const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+
+      const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va0, 0));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c0));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c0));
+      const int8x8_t vb89ABCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va0, 0));
+      vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc0));
+      vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc0));
+      const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va0, 1));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c1));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c1));
+      const int8x8_t vb89ABCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va0, 1));
+      vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc1));
+      vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc1));
+      const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va0, 2));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c2));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c2));
+      const int8x8_t vb89ABCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va0, 2));
+      vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc2));
+      vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc2));
+      const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));
+      const int8x8_t vb89ABCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va0, 3));
+      vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc3));
+      vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc3));
+      const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va0, 4));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c4));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c4));
+      const int8x8_t vb89ABCDEFc4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va0, 4));
+      vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc4));
+      vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc4));
+      const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va0, 5));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c5));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c5));
+      const int8x8_t vb89ABCDEFc5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va0, 5));
+      vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc5));
+      vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc5));
+      const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va0, 6));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c6));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c6));
+      const int8x8_t vb89ABCDEFc6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va0, 6));
+      vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc6));
+      vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc6));
+      const int8x8_t vb01234567c7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c7 = vmull_s8(vb01234567c7, vdup_lane_s8(va0, 7));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c7));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c7));
+      const int8x8_t vb89ABCDEFc7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x89ABCDEFc7 = vmull_s8(vb89ABCDEFc7, vdup_lane_s8(va0, 7));
+      vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc7));
+      vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc7));
+
+      k -= 8 * sizeof(int8_t);
+    }
+    if XNN_UNLIKELY(k != 0) {
+      const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+
+      const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va0, 0));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c0));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c0));
+      const int16x8_t vprod0x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va0, 0));
+      vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc0));
+      vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc0));
+
+      if (k >= 2 * sizeof(int8_t)) {
+        const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va0, 1));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c1));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c1));
+        const int16x8_t vprod0x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va0, 1));
+        vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc1));
+        vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc1));
+
+        if (k > 2 * sizeof(int8_t)) {
+          const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb89ABCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          const int16x8_t vprod0x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va0, 2));
+          vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c2));
+          vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c2));
+          const int16x8_t vprod0x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va0, 2));
+          vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc2));
+          vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc2));
+
+          if (k >= 4 * sizeof(int8_t)) {
+            const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int8x8_t vb89ABCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+            const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));
+            vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
+            vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));
+            const int16x8_t vprod0x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va0, 3));
+            vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc3));
+            vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc3));
+
+            if (k > 4 * sizeof(int8_t)) {
+              const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+              const int8x8_t vb89ABCDEFc4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+              const int16x8_t vprod0x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va0, 4));
+              vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c4));
+              vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c4));
+              const int16x8_t vprod0x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va0, 4));
+              vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc4));
+              vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc4));
+
+              if (k >= 6 * sizeof(int8_t)) {
+                const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+                const int8x8_t vb89ABCDEFc5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+                const int16x8_t vprod0x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va0, 5));
+                vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c5));
+                vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c5));
+                const int16x8_t vprod0x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va0, 5));
+                vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc5));
+                vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc5));
+
+                if (k > 6 * sizeof(int8_t)) {
+                  const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+                  const int8x8_t vb89ABCDEFc6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+                  const int16x8_t vprod0x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va0, 6));
+                  vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c6));
+                  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c6));
+                  const int16x8_t vprod0x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va0, 6));
+                  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc6));
+                  vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc6));
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+    vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+    vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+
+    vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+
+    if (nc >= 16) {
+      vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
+      nc -= 16;
+    } else {
+      int8x8_t vout0x01234567 = vget_low_s8(vout0x0123456789ABCDEF);
+      if (nc & 8) {
+        vst1_s8(c0, vout0x01234567); c0 += 8;
+        vout0x01234567 = vget_high_s8(vout0x0123456789ABCDEF);
+      }
+      if (nc & 4) {
+        vst1_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpret_u32_s8(vout0x01234567), 0); c0 += 4;
+        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpret_u16_s8(vout0x01234567), 0); c0 += 2;
+        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1_lane_s8(c0, vout0x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
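The kernel above — like the other new QS8 GEMM kernels in this change — ends with the same fixed-point requantization: vqrdmulhq_s32 against params->neon.multiplier, a vbicq/vsraq fixup, a rounding shift with vrshlq_s32, addition of the output zero point, saturating narrowing, and a clamp to [output_min, output_max]. A scalar sketch of that epilogue follows; the function name and argument conventions are assumptions (in particular, `shift` here is a non-negative right-shift count, whereas the kernels store a value that is fed directly to vrshlq_s32), and the single saturating case of VQRDMULH is ignored for brevity.

#include <stdint.h>

static inline int8_t qs8_requantize_ref(
    int32_t acc, int32_t multiplier, uint32_t shift,
    int16_t output_zero_point, int8_t output_min, int8_t output_max)
{
  // VQRDMULH: Q31 rounding doubling multiply-high of the accumulator.
  const int64_t product = (int64_t) acc * (int64_t) multiplier;
  int32_t q = (int32_t) ((product + (INT64_C(1) << 30)) >> 31);
  // VBIC/VSRA fixup + VRSHL: rounding arithmetic right shift with ties
  // rounded away from zero; a zero shift leaves q unchanged.
  if (shift != 0) {
    if (q < 0) {
      q -= 1;
    }
    q = (q + (INT32_C(1) << (shift - 1))) >> shift;
  }
  // Add the output zero point, then clamp to the quantized output range.
  int32_t out = q + (int32_t) output_zero_point;
  if (out < (int32_t) output_min) { out = (int32_t) output_min; }
  if (out > (int32_t) output_max) { out = (int32_t) output_max; }
  return (int8_t) out;
}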
diff --git a/src/qs8-gemm/gen/1x16c16-minmax-neon-mlal-padal.c b/src/qs8-gemm/gen/1x16c16-minmax-neon-mlal-padal.c
new file mode 100644
index 0000000..ce7d5d9
--- /dev/null
+++ b/src/qs8-gemm/gen/1x16c16-minmax-neon-mlal-padal.c
@@ -0,0 +1,244 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/c16-neon-mlal-padal.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 16);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+
+  do {
+    int32x4_t vacc0x0 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x1 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x2 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x3 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x4 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x5 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x6 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x7 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x8 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x9 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x10 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x11 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x12 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x13 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x14 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x15 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+
+    // KC loop of 16
+    size_t k = 0;
+    while (k < kc) {
+      const int8x16_t va0 = vld1q_s8(a0); a0 += 16;
+
+      const int8x16_t vb0 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb1 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb2 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb3 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb4 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb5 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb6 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb7 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb8 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb9 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb10 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb11 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb12 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb13 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb14 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb15 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+
+      int16x8_t vprod0x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va0));
+      vprod0x0 = vmlal_s8(vprod0x0, vget_high_s8(vb0), vget_high_s8(va0));
+      vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+      int16x8_t vprod0x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va0));
+      vprod0x1 = vmlal_s8(vprod0x1, vget_high_s8(vb1), vget_high_s8(va0));
+      vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+      int16x8_t vprod0x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va0));
+      vprod0x2 = vmlal_s8(vprod0x2, vget_high_s8(vb2), vget_high_s8(va0));
+      vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+      int16x8_t vprod0x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va0));
+      vprod0x3 = vmlal_s8(vprod0x3, vget_high_s8(vb3), vget_high_s8(va0));
+      vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+      int16x8_t vprod0x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va0));
+      vprod0x4 = vmlal_s8(vprod0x4, vget_high_s8(vb4), vget_high_s8(va0));
+      vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+      int16x8_t vprod0x5 = vmull_s8(vget_low_s8(vb5), vget_low_s8(va0));
+      vprod0x5 = vmlal_s8(vprod0x5, vget_high_s8(vb5), vget_high_s8(va0));
+      vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+      int16x8_t vprod0x6 = vmull_s8(vget_low_s8(vb6), vget_low_s8(va0));
+      vprod0x6 = vmlal_s8(vprod0x6, vget_high_s8(vb6), vget_high_s8(va0));
+      vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+      int16x8_t vprod0x7 = vmull_s8(vget_low_s8(vb7), vget_low_s8(va0));
+      vprod0x7 = vmlal_s8(vprod0x7, vget_high_s8(vb7), vget_high_s8(va0));
+      vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+      int16x8_t vprod0x8 = vmull_s8(vget_low_s8(vb8), vget_low_s8(va0));
+      vprod0x8 = vmlal_s8(vprod0x8, vget_high_s8(vb8), vget_high_s8(va0));
+      vacc0x8 = vpadalq_s16(vacc0x8, vprod0x8);
+      int16x8_t vprod0x9 = vmull_s8(vget_low_s8(vb9), vget_low_s8(va0));
+      vprod0x9 = vmlal_s8(vprod0x9, vget_high_s8(vb9), vget_high_s8(va0));
+      vacc0x9 = vpadalq_s16(vacc0x9, vprod0x9);
+      int16x8_t vprod0x10 = vmull_s8(vget_low_s8(vb10), vget_low_s8(va0));
+      vprod0x10 = vmlal_s8(vprod0x10, vget_high_s8(vb10), vget_high_s8(va0));
+      vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
+      int16x8_t vprod0x11 = vmull_s8(vget_low_s8(vb11), vget_low_s8(va0));
+      vprod0x11 = vmlal_s8(vprod0x11, vget_high_s8(vb11), vget_high_s8(va0));
+      vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11);
+      int16x8_t vprod0x12 = vmull_s8(vget_low_s8(vb12), vget_low_s8(va0));
+      vprod0x12 = vmlal_s8(vprod0x12, vget_high_s8(vb12), vget_high_s8(va0));
+      vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);
+      int16x8_t vprod0x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va0));
+      vprod0x13 = vmlal_s8(vprod0x13, vget_high_s8(vb13), vget_high_s8(va0));
+      vacc0x13 = vpadalq_s16(vacc0x13, vprod0x13);
+      int16x8_t vprod0x14 = vmull_s8(vget_low_s8(vb14), vget_low_s8(va0));
+      vprod0x14 = vmlal_s8(vprod0x14, vget_high_s8(vb14), vget_high_s8(va0));
+      vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
+      int16x8_t vprod0x15 = vmull_s8(vget_low_s8(vb15), vget_low_s8(va0));
+      vprod0x15 = vmlal_s8(vprod0x15, vget_high_s8(vb15), vget_high_s8(va0));
+      vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
+
+      k += 16 * sizeof(int8_t);
+    }
+
+#if XNN_ARCH_ARM64
+    const int32x4_t vsum0x01 = vpaddq_s32(vacc0x0, vacc0x1);
+    const int32x4_t vsum0x23 = vpaddq_s32(vacc0x2, vacc0x3);
+    const int32x4_t vsum0x45 = vpaddq_s32(vacc0x4, vacc0x5);
+    const int32x4_t vsum0x67 = vpaddq_s32(vacc0x6, vacc0x7);
+    const int32x4_t vsum0x89 = vpaddq_s32(vacc0x8, vacc0x9);
+    const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11);
+    const int32x4_t vsum0xCD = vpaddq_s32(vacc0x12, vacc0x13);
+    const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
+    int32x4_t vacc0x0123 = vpaddq_s32(vsum0x01, vsum0x23);
+    int32x4_t vacc0x4567 = vpaddq_s32(vsum0x45, vsum0x67);
+    int32x4_t vacc0x89AB = vpaddq_s32(vsum0x89, vsum0xAB);
+    int32x4_t vacc0xCDEF = vpaddq_s32(vsum0xCD, vsum0xEF);
+#else
+    const int32x2_t vpsum0x0 = vadd_s32(vget_low_s32(vacc0x0), vget_high_s32(vacc0x0));
+    const int32x2_t vpsum0x1 = vadd_s32(vget_low_s32(vacc0x1), vget_high_s32(vacc0x1));
+    const int32x2_t vpsum0x2 = vadd_s32(vget_low_s32(vacc0x2), vget_high_s32(vacc0x2));
+    const int32x2_t vpsum0x3 = vadd_s32(vget_low_s32(vacc0x3), vget_high_s32(vacc0x3));
+    const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1);
+    const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3);
+    int32x4_t vacc0x0123 = vcombine_s32(vsum0x01, vsum0x23);
+    const int32x2_t vpsum0x4 = vadd_s32(vget_low_s32(vacc0x4), vget_high_s32(vacc0x4));
+    const int32x2_t vpsum0x5 = vadd_s32(vget_low_s32(vacc0x5), vget_high_s32(vacc0x5));
+    const int32x2_t vpsum0x6 = vadd_s32(vget_low_s32(vacc0x6), vget_high_s32(vacc0x6));
+    const int32x2_t vpsum0x7 = vadd_s32(vget_low_s32(vacc0x7), vget_high_s32(vacc0x7));
+    const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5);
+    const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7);
+    int32x4_t vacc0x4567 = vcombine_s32(vsum0x45, vsum0x67);
+    const int32x2_t vpsum0x8 = vadd_s32(vget_low_s32(vacc0x8), vget_high_s32(vacc0x8));
+    const int32x2_t vpsum0x9 = vadd_s32(vget_low_s32(vacc0x9), vget_high_s32(vacc0x9));
+    const int32x2_t vpsum0xA = vadd_s32(vget_low_s32(vacc0x10), vget_high_s32(vacc0x10));
+    const int32x2_t vpsum0xB = vadd_s32(vget_low_s32(vacc0x11), vget_high_s32(vacc0x11));
+    const int32x2_t vsum0x89 = vpadd_s32(vpsum0x8, vpsum0x9);
+    const int32x2_t vsum0xAB = vpadd_s32(vpsum0xA, vpsum0xB);
+    int32x4_t vacc0x89AB = vcombine_s32(vsum0x89, vsum0xAB);
+    const int32x2_t vpsum0xC = vadd_s32(vget_low_s32(vacc0x12), vget_high_s32(vacc0x12));
+    const int32x2_t vpsum0xD = vadd_s32(vget_low_s32(vacc0x13), vget_high_s32(vacc0x13));
+    const int32x2_t vpsum0xE = vadd_s32(vget_low_s32(vacc0x14), vget_high_s32(vacc0x14));
+    const int32x2_t vpsum0xF = vadd_s32(vget_low_s32(vacc0x15), vget_high_s32(vacc0x15));
+    const int32x2_t vsum0xCD = vpadd_s32(vpsum0xC, vpsum0xD);
+    const int32x2_t vsum0xEF = vpadd_s32(vpsum0xE, vpsum0xF);
+    int32x4_t vacc0xCDEF = vcombine_s32(vsum0xCD, vsum0xEF);
+#endif
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+    vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+    vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+    int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+
+    vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+
+    if (nc >= 16) {
+      vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
+      nc -= 16;
+    } else {
+      int8x8_t vout0x01234567 = vget_low_s8(vout0x0123456789ABCDEF);
+      if (nc & 8) {
+        vst1_s8(c0, vout0x01234567); c0 += 8;
+        vout0x01234567 = vget_high_s8(vout0x0123456789ABCDEF);
+      }
+      if (nc & 4) {
+        vst1_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpret_u32_s8(vout0x01234567), 0); c0 += 4;
+        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpret_u16_s8(vout0x01234567), 0); c0 += 2;
+        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1_lane_s8(c0, vout0x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
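In the 1x16c16 MLAL kernel above, each K step multiplies 16 input bytes against 16 weight bytes per output channel with vmull_s8/vmlal_s8 and folds the int16 products into the int32 accumulator via vpadalq_s16. Per output channel the step reduces to a 16-element signed dot product, as in this sketch (the function name is an assumption, and the rare int16 overflow of a pair of -128 * -128 products is ignored):

#include <stddef.h>
#include <stdint.h>

static inline int32_t qs8_c16_step_ref(int32_t acc, const int8_t a[16], const int8_t b[16])
{
  // Equivalent to one vmull_s8 + vmlal_s8 + vpadalq_s16 sequence for a
  // single output channel: accumulate a 16-element dot product into acc.
  for (size_t i = 0; i < 16; i++) {
    acc += (int32_t) a[i] * (int32_t) b[i];
  }
  return acc;
}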
diff --git a/src/qs8-gemm/gen/1x16c2-minmax-neon-mlal-padal-dup.c b/src/qs8-gemm/gen/1x16c2-minmax-neon-mlal-padal-dup.c
new file mode 100644
index 0000000..7c5f25d
--- /dev/null
+++ b/src/qs8-gemm/gen/1x16c2-minmax-neon-mlal-padal-dup.c
@@ -0,0 +1,311 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/c2-neon-mull-padal-dup.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+
+  do {
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t k = kc;
+
+    while (k >= 16 * sizeof(int8_t)) {
+      const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;
+      const int8x8_t va0x1 = vld1_s8(a0); a0 += 8;
+
+      const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABc0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vbCDEFc0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABc1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vbCDEFc1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABc2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vbCDEFc2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABc3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vbCDEFc3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+      const int8x8_t vb0123c0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x0123c0 = vmlal_s8(vprod0x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+      int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+      const int8x8_t vb4567c0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x4567c0 = vmlal_s8(vprod0x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+      int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+      const int8x8_t vb89ABc0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x89ABc0 = vmlal_s8(vprod0x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+      vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+      int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+      const int8x8_t vbCDEFc0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0xCDEFc0 = vmlal_s8(vprod0xCDEFc0, vbCDEFc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+      vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+      int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+      const int8x8_t vb0123c1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x0123c1 = vmlal_s8(vprod0x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+      int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+      const int8x8_t vb4567c1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x4567c1 = vmlal_s8(vprod0x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+      int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+      const int8x8_t vb89ABc1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x89ABc1 = vmlal_s8(vprod0x89ABc1, vb89ABc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+      vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+      int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+      const int8x8_t vbCDEFc1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0xCDEFc1 = vmlal_s8(vprod0xCDEFc1, vbCDEFc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+      vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+      int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+      const int8x8_t vb0123c2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x0123c2 = vmlal_s8(vprod0x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+      int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+      const int8x8_t vb4567c2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x4567c2 = vmlal_s8(vprod0x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+      int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+      const int8x8_t vb89ABc2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x89ABc2 = vmlal_s8(vprod0x89ABc2, vb89ABc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+      vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+      int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+      const int8x8_t vbCDEFc2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0xCDEFc2 = vmlal_s8(vprod0xCDEFc2, vbCDEFc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+      vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+      int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+      const int8x8_t vb0123c3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x0123c3 = vmlal_s8(vprod0x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+      int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+      const int8x8_t vb4567c3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x4567c3 = vmlal_s8(vprod0x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+      int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+      const int8x8_t vb89ABc3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x89ABc3 = vmlal_s8(vprod0x89ABc3, vb89ABc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+      vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
+      int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+      const int8x8_t vbCDEFc3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0xCDEFc3 = vmlal_s8(vprod0xCDEFc3, vbCDEFc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+      vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
+
+      k -= 16 * sizeof(int8_t);
+    }
+
+    if (k >= 8 * sizeof(int8_t)) {
+      const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+
+      const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vbCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vbCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vbCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vbCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+      const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+      const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+      const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+      const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+      const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+      const int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      const int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+      const int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+      const int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+      vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+      vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+      vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+      vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
+      const int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      const int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+      const int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+      const int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+      vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+      vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+      vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+      vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
+
+      k -= 8 * sizeof(int8_t);
+    }
+
+    if XNN_UNLIKELY(k != 0) {
+      const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+
+      const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vbCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+      const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+      const int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+      const int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+
+      if (k > 2 * sizeof(int8_t)) {
+        const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+        const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+        const int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+        const int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+
+        if (k > 4 * sizeof(int8_t)) {
+          const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb89ABc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vbCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+          vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+          const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+          vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+          const int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+          vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+          const int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+          vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+        }
+      }
+    }
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+    vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+    vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+
+    vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+
+    if (nc >= 16) {
+      vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
+      nc -= 16;
+    } else {
+      int8x8_t vout0x01234567 = vget_low_s8(vout0x0123456789ABCDEF);
+      if (nc & 8) {
+        vst1_s8(c0, vout0x01234567); c0 += 8;
+        vout0x01234567 = vget_high_s8(vout0x0123456789ABCDEF);
+      }
+      if (nc & 4) {
+        vst1_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpret_u32_s8(vout0x01234567), 0); c0 += 4;
+        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpret_u16_s8(vout0x01234567), 0); c0 += 2;
+        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1_lane_s8(c0, vout0x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
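The c2 "dup" kernels above broadcast each pair of input bytes with vdup_lane_s16(vreinterpret_s16_s8(va0), n); one vmull_s8 (plus vmlal_s8 in the MLAL variant) followed by vpadalq_s16 then accumulates those two K elements for four output channels at once. For a single output channel, one c2 step is simply the following (illustrative sketch; the name is an assumption):

#include <stdint.h>

static inline int32_t qs8_c2_step_ref(int32_t acc, const int8_t a[2], const int8_t b[2])
{
  // Two consecutive K elements of the input row times the matching pair of
  // packed weight bytes, added into the int32 accumulator.
  return acc + (int32_t) a[0] * (int32_t) b[0] + (int32_t) a[1] * (int32_t) b[1];
}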
diff --git a/src/qs8-gemm/gen/1x16c2-minmax-neon-mull-padal-dup.c b/src/qs8-gemm/gen/1x16c2-minmax-neon-mull-padal-dup.c
new file mode 100644
index 0000000..8447491
--- /dev/null
+++ b/src/qs8-gemm/gen/1x16c2-minmax-neon-mull-padal-dup.c
@@ -0,0 +1,223 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/c2-neon-mull-padal-dup.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+
+  do {
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t k = kc;
+
+
+    while (k >= 8 * sizeof(int8_t)) {
+      const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+
+      const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vbCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vbCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vbCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vbCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+      const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+      const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+      const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+      const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+      const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+      const int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      const int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+      const int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+      const int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+      vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+      vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+      vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+      vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
+      const int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      const int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+      const int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+      const int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+      vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+      vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+      vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+      vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
+
+      k -= 8 * sizeof(int8_t);
+    }
+
+    if XNN_UNLIKELY(k != 0) {
+      const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+
+      const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vbCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+      const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+      const int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+      const int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+
+      if (k > 2 * sizeof(int8_t)) {
+        const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+        const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+        const int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+        const int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+
+        if (k > 4 * sizeof(int8_t)) {
+          const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb89ABc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vbCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+          vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+          const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+          vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+          const int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+          vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+          const int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+          vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+        }
+      }
+    }
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+    vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+    vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+
+    vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+
+    if (nc >= 16) {
+      vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
+      nc -= 16;
+    } else {
+      int8x8_t vout0x01234567 = vget_low_s8(vout0x0123456789ABCDEF);
+      if (nc & 8) {
+        vst1_s8(c0, vout0x01234567); c0 += 8;
+        vout0x01234567 = vget_high_s8(vout0x0123456789ABCDEF);
+      }
+      if (nc & 4) {
+        vst1_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpret_u32_s8(vout0x01234567), 0); c0 += 4;
+        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpret_u16_s8(vout0x01234567), 0); c0 += 2;
+        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1_lane_s8(c0, vout0x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
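
The kernel above ends with the requantization sequence shared by the QS8 GEMM microkernels in this diff: a saturating rounding doubling multiply by params->neon.multiplier (VQRDMULH), a rounding right shift driven by params->neon.right_shift (the VSRA #31 fix-up followed by VRSHL), addition of the output zero point with saturating narrowing, and a final clamp to output_min/output_max. A rough scalar model of one lane of that path is sketched below; it is illustrative only and not bit-exact, and the non-negative `shift` parameter is an assumption of the sketch (the kernel encodes the shift as a negative VRSHL operand).

#include <stdint.h>

/* Rough scalar model of the post-accumulation path above; illustrative,
 * not bit-exact. `shift` is a non-negative bit count here, whereas the
 * kernel stores it as a negative VRSHL shift. */
static inline int8_t qs8_requantize_model(
    int32_t acc, int32_t multiplier, uint32_t shift,
    int16_t output_zero_point, int8_t output_min, int8_t output_max)
{
  /* acc * multiplier / 2^31, rounded: the VQRDMULH step. */
  int64_t scaled = ((int64_t) acc * multiplier + (INT64_C(1) << 30)) >> 31;
  /* Rounding right shift: the VSRA #31 fix-up followed by VRSHL. */
  if (shift != 0) {
    scaled = (scaled + (INT64_C(1) << (shift - 1))) >> shift;
  }
  /* Add the output zero point, then clamp to the requested range. */
  int32_t out = (int32_t) scaled + output_zero_point;
  if (out < output_min) out = output_min;
  if (out > output_max) out = output_max;
  return (int8_t) out;
}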
diff --git a/src/qs8-gemm/gen/1x16c4-minmax-neondot.c b/src/qs8-gemm/gen/1x16c4-minmax-neondot.c
index cd8f44f..7645374 100644
--- a/src/qs8-gemm/gen/1x16c4-minmax-neondot.c
+++ b/src/qs8-gemm/gen/1x16c4-minmax-neondot.c
@@ -7,18 +7,14 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
-
 #include <assert.h>
 
 #include <arm_neon.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
-// This kernel uses ARMv8.2 dot-product instructions.
-//
-// Scalar model: xnn_qs8_gemm_minmax_ukernel_1x16c4__scalar. Refer to
-// that kernel for more comments.
 void xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot(
     size_t mr,
     size_t nc,
@@ -34,7 +30,12 @@
   assert(mr <= 1);
   assert(nc != 0);
   assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
 
+  kc = round_up_po2(kc, 4);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -55,14 +56,14 @@
       const int8x8_t va0x01234567 = vld1_s8(a0); a0 += 8;
 
       // Load a 8x16 block of weights.
-      const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb0123x89AB = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb0123xCDEF = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb4567x89AB = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb4567xCDEF = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
+      const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+      const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+      const int8x16_t vb0123x89AB = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+      const int8x16_t vb0123xCDEF = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+      const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+      const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+      const int8x16_t vb4567x89AB = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+      const int8x16_t vb4567xCDEF = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
 
       // Multiply-accumulate: 1x8 * 8x16 --> 1x16.
       vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb0123x0123, va0x01234567, 0);
@@ -76,43 +77,25 @@
 
       k -= 8 * sizeof(int8_t);
     }
-    // Handle up to 7 final positions of `k`
+    // Handle up to 4 final positions of `k`
     if XNN_UNLIKELY(k != 0) {
       // Load a 1x4 block of activations.
-      const int8x8_t va0x01234567 = vld1_s8(a0); a0 += k;
+      const int8x8_t va0x01234567 = vld1_s8(a0); a0 += 4;
 
       // Load a 4x16 block of weights.
-      const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb0123x89AB = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb0123xCDEF = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
+      const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+      const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+      const int8x16_t vb0123x89AB = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+      const int8x16_t vb0123xCDEF = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
 
       // Multiply-accumulate: 1x4 * 4x16 --> 1x16.
       vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb0123x0123, va0x01234567, 0);
       vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb0123x4567, va0x01234567, 0);
       vacc0x89AB = vdotq_lane_s32(vacc0x89AB, vb0123x89AB, va0x01234567, 0);
       vacc0xCDEF = vdotq_lane_s32(vacc0xCDEF, vb0123xCDEF, va0x01234567, 0);
-
-      if (k > 4) {
-        // Load a 4x16 block of weights.
-        const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb4567x89AB = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb4567xCDEF = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-
-        // Multiply-accumulate: 1x4 * 4x16 --> 1x16.
-        vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
-        vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
-        vacc0x89AB = vdotq_lane_s32(vacc0x89AB, vb4567x89AB, va0x01234567, 1);
-        vacc0xCDEF = vdotq_lane_s32(vacc0xCDEF, vb4567xCDEF, va0x01234567, 1);
-      }
     }
-    // End of accumulation loop. The variable `kc` contains the amount by which
-    // we advanced the `va` pointers, so we rewind by this amount now.
-    a0 = (const int8_t*)((uintptr_t)a0 - kc);
 
     // Post-accumulation work
-
     const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
     const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
 
@@ -157,6 +140,8 @@
       // Advance to the next 16 columns.
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 16;
     } else {
       // Final case where not all of the 16 columns fit in the destination.
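
The key functional change in this file is `kc = round_up_po2(kc, 4)` (hence the new `<xnnpack/math.h>` include): the packed weights are laid out in 4-byte groups, so the kernel can treat kc as a multiple of 4, the tail only ever sees 4 leftover bytes, and both the old `k > 4` block and the end-of-loop rewind comment could be dropped. A minimal sketch of what such a power-of-two round-up does, assuming the usual bit-trick form (the real definition lives in xnnpack/math.h):

#include <stddef.h>

/* Round n up to a multiple of q, where q is a power of two.
 * Assumed to match the intent of round_up_po2(); see xnnpack/math.h. */
static inline size_t round_up_po2_sketch(size_t n, size_t q) {
  return (n + q - 1) & ~(q - 1);
}

/* Example: kc = 10 rounds up to 12, the 8-byte main loop runs once,
 * and the remainder path handles exactly k == 4 bytes. */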
diff --git a/src/qs8-gemm/gen/1x16c8-minmax-avx512skx.c b/src/qs8-gemm/gen/1x16c8-minmax-avx512skx.c
index e03b182..52ac712 100644
--- a/src/qs8-gemm/gen/1x16c8-minmax-avx512skx.c
+++ b/src/qs8-gemm/gen/1x16c8-minmax-avx512skx.c
@@ -13,6 +13,7 @@
 
 #include <xnnpack/gemm.h>
 #include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx(
@@ -36,6 +37,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
diff --git a/src/qs8-gemm/gen/1x16c8-minmax-neon-mlal-padal.c b/src/qs8-gemm/gen/1x16c8-minmax-neon-mlal-padal.c
new file mode 100644
index 0000000..ad8313d
--- /dev/null
+++ b/src/qs8-gemm/gen/1x16c8-minmax-neon-mlal-padal.c
@@ -0,0 +1,317 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/c8-neon-mull-padal.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+
+  do {
+    int32x4_t vacc0x0 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x1 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x2 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x3 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x4 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x5 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x6 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x7 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x8 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x9 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x10 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x11 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x12 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x13 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x14 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x15 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+
+    size_t k = kc;
+    // Loop unrolled 2x: load 16 bytes of activations at a time and accumulate with MULL+MLAL.
+    while (k >= 16 * sizeof(int8_t)) {
+      const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;
+      const int8x8_t va0x1 = vld1_s8(a0); a0 += 8;
+
+      const int8x8_t vb0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb4x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb5x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb6x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb7x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb8x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb9x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb10x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb11x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb12x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb13x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb14x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb15x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+
+      const int8x8_t vb0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x0 = vmull_s8(vb0x0, va0x0);
+      vprod0x0 = vmlal_s8(vprod0x0, vb0x1, va0x1);
+      vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+      const int8x8_t vb1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x1 = vmull_s8(vb1x0, va0x0);
+      vprod0x1 = vmlal_s8(vprod0x1, vb1x1, va0x1);
+      vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+      const int8x8_t vb2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x2 = vmull_s8(vb2x0, va0x0);
+      vprod0x2 = vmlal_s8(vprod0x2, vb2x1, va0x1);
+      vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+      const int8x8_t vb3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x3 = vmull_s8(vb3x0, va0x0);
+      vprod0x3 = vmlal_s8(vprod0x3, vb3x1, va0x1);
+      vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+      const int8x8_t vb4x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x4 = vmull_s8(vb4x0, va0x0);
+      vprod0x4 = vmlal_s8(vprod0x4, vb4x1, va0x1);
+      vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+      const int8x8_t vb5x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x5 = vmull_s8(vb5x0, va0x0);
+      vprod0x5 = vmlal_s8(vprod0x5, vb5x1, va0x1);
+      vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+      const int8x8_t vb6x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x6 = vmull_s8(vb6x0, va0x0);
+      vprod0x6 = vmlal_s8(vprod0x6, vb6x1, va0x1);
+      vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+      const int8x8_t vb7x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x7 = vmull_s8(vb7x0, va0x0);
+      vprod0x7 = vmlal_s8(vprod0x7, vb7x1, va0x1);
+      vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+      const int8x8_t vb8x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x8 = vmull_s8(vb8x0, va0x0);
+      vprod0x8 = vmlal_s8(vprod0x8, vb8x1, va0x1);
+      vacc0x8 = vpadalq_s16(vacc0x8, vprod0x8);
+      const int8x8_t vb9x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x9 = vmull_s8(vb9x0, va0x0);
+      vprod0x9 = vmlal_s8(vprod0x9, vb9x1, va0x1);
+      vacc0x9 = vpadalq_s16(vacc0x9, vprod0x9);
+      const int8x8_t vb10x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x10 = vmull_s8(vb10x0, va0x0);
+      vprod0x10 = vmlal_s8(vprod0x10, vb10x1, va0x1);
+      vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
+      const int8x8_t vb11x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x11 = vmull_s8(vb11x0, va0x0);
+      vprod0x11 = vmlal_s8(vprod0x11, vb11x1, va0x1);
+      vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11);
+      const int8x8_t vb12x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x12 = vmull_s8(vb12x0, va0x0);
+      vprod0x12 = vmlal_s8(vprod0x12, vb12x1, va0x1);
+      vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);
+      const int8x8_t vb13x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x13 = vmull_s8(vb13x0, va0x0);
+      vprod0x13 = vmlal_s8(vprod0x13, vb13x1, va0x1);
+      vacc0x13 = vpadalq_s16(vacc0x13, vprod0x13);
+      const int8x8_t vb14x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x14 = vmull_s8(vb14x0, va0x0);
+      vprod0x14 = vmlal_s8(vprod0x14, vb14x1, va0x1);
+      vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
+      const int8x8_t vb15x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x15 = vmull_s8(vb15x0, va0x0);
+      vprod0x15 = vmlal_s8(vprod0x15, vb15x1, va0x1);
+      vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
+
+      k -= 16 * sizeof(int8_t);
+    }
+
+    // Handle the final 8 bytes, if any, with a single MULL pass.
+    if (k > 0) {
+      const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+
+      const int8x8_t vb0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x0 = vmull_s8(vb0, va0);
+      vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+      const int8x8_t vb1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x1 = vmull_s8(vb1, va0);
+      vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+      const int8x8_t vb2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x2 = vmull_s8(vb2, va0);
+      vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+      const int8x8_t vb3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x3 = vmull_s8(vb3, va0);
+      vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+      const int8x8_t vb4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x4 = vmull_s8(vb4, va0);
+      vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+      const int8x8_t vb5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x5 = vmull_s8(vb5, va0);
+      vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+      const int8x8_t vb6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x6 = vmull_s8(vb6, va0);
+      vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+      const int8x8_t vb7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x7 = vmull_s8(vb7, va0);
+      vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+      const int8x8_t vb8 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x8 = vmull_s8(vb8, va0);
+      vacc0x8 = vpadalq_s16(vacc0x8, vprod0x8);
+      const int8x8_t vb9 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x9 = vmull_s8(vb9, va0);
+      vacc0x9 = vpadalq_s16(vacc0x9, vprod0x9);
+      const int8x8_t vb10 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x10 = vmull_s8(vb10, va0);
+      vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
+      const int8x8_t vb11 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x11 = vmull_s8(vb11, va0);
+      vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11);
+      const int8x8_t vb12 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x12 = vmull_s8(vb12, va0);
+      vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);
+      const int8x8_t vb13 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x13 = vmull_s8(vb13, va0);
+      vacc0x13 = vpadalq_s16(vacc0x13, vprod0x13);
+      const int8x8_t vb14 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x14 = vmull_s8(vb14, va0);
+      vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
+      const int8x8_t vb15 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x15 = vmull_s8(vb15, va0);
+      vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
+
+      k -= 8 * sizeof(int8_t);
+    }
+
+#if XNN_ARCH_ARM64
+    const int32x4_t vsum0x01 = vpaddq_s32(vacc0x0, vacc0x1);
+    const int32x4_t vsum0x23 = vpaddq_s32(vacc0x2, vacc0x3);
+    const int32x4_t vsum0x45 = vpaddq_s32(vacc0x4, vacc0x5);
+    const int32x4_t vsum0x67 = vpaddq_s32(vacc0x6, vacc0x7);
+    const int32x4_t vsum0x89 = vpaddq_s32(vacc0x8, vacc0x9);
+    const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11);
+    const int32x4_t vsum0xCD = vpaddq_s32(vacc0x12, vacc0x13);
+    const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
+    int32x4_t vacc0x0123 = vpaddq_s32(vsum0x01, vsum0x23);
+    int32x4_t vacc0x4567 = vpaddq_s32(vsum0x45, vsum0x67);
+    int32x4_t vacc0x89AB = vpaddq_s32(vsum0x89, vsum0xAB);
+    int32x4_t vacc0xCDEF = vpaddq_s32(vsum0xCD, vsum0xEF);
+#else
+    const int32x2_t vpsum0x0 = vadd_s32(vget_low_s32(vacc0x0), vget_high_s32(vacc0x0));
+    const int32x2_t vpsum0x1 = vadd_s32(vget_low_s32(vacc0x1), vget_high_s32(vacc0x1));
+    const int32x2_t vpsum0x2 = vadd_s32(vget_low_s32(vacc0x2), vget_high_s32(vacc0x2));
+    const int32x2_t vpsum0x3 = vadd_s32(vget_low_s32(vacc0x3), vget_high_s32(vacc0x3));
+    const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1);
+    const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3);
+    int32x4_t vacc0x0123 = vcombine_s32(vsum0x01, vsum0x23 );
+    const int32x2_t vpsum0x4 = vadd_s32(vget_low_s32(vacc0x4), vget_high_s32(vacc0x4));
+    const int32x2_t vpsum0x5 = vadd_s32(vget_low_s32(vacc0x5), vget_high_s32(vacc0x5));
+    const int32x2_t vpsum0x6 = vadd_s32(vget_low_s32(vacc0x6), vget_high_s32(vacc0x6));
+    const int32x2_t vpsum0x7 = vadd_s32(vget_low_s32(vacc0x7), vget_high_s32(vacc0x7));
+    const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5);
+    const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7);
+    int32x4_t vacc0x4567 = vcombine_s32(vsum0x45, vsum0x67 );
+    const int32x2_t vpsum0x8 = vadd_s32(vget_low_s32(vacc0x8), vget_high_s32(vacc0x8));
+    const int32x2_t vpsum0x9 = vadd_s32(vget_low_s32(vacc0x9), vget_high_s32(vacc0x9));
+    const int32x2_t vpsum0xA = vadd_s32(vget_low_s32(vacc0x10), vget_high_s32(vacc0x10));
+    const int32x2_t vpsum0xB = vadd_s32(vget_low_s32(vacc0x11), vget_high_s32(vacc0x11));
+    const int32x2_t vsum0x89 = vpadd_s32(vpsum0x8, vpsum0x9);
+    const int32x2_t vsum0xAB = vpadd_s32(vpsum0xA, vpsum0xB);
+    int32x4_t vacc0x89AB = vcombine_s32(vsum0x89, vsum0xAB );
+    const int32x2_t vpsum0xC = vadd_s32(vget_low_s32(vacc0x12), vget_high_s32(vacc0x12));
+    const int32x2_t vpsum0xD = vadd_s32(vget_low_s32(vacc0x13), vget_high_s32(vacc0x13));
+    const int32x2_t vpsum0xE = vadd_s32(vget_low_s32(vacc0x14), vget_high_s32(vacc0x14));
+    const int32x2_t vpsum0xF = vadd_s32(vget_low_s32(vacc0x15), vget_high_s32(vacc0x15));
+    const int32x2_t vsum0xCD = vpadd_s32(vpsum0xC, vpsum0xD);
+    const int32x2_t vsum0xEF = vpadd_s32(vpsum0xE, vpsum0xF);
+    int32x4_t vacc0xCDEF = vcombine_s32(vsum0xCD, vsum0xEF );
+#endif
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+    vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+    vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+    int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+
+    vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+
+    if (nc >= 16) {
+      vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
+      nc -= 16;
+    } else {
+      int8x8_t vout0x01234567 = vget_low_s8(vout0x0123456789ABCDEF);
+      if (nc & 8) {
+        vst1_s8(c0, vout0x01234567); c0 += 8;
+        vout0x01234567 = vget_high_s8(vout0x0123456789ABCDEF);
+      }
+      if (nc & 4) {
+        vst1_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpret_u32_s8(vout0x01234567), 0); c0 += 4;
+        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpret_u16_s8(vout0x01234567), 0); c0 += 2;
+        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1_lane_s8(c0, vout0x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
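
The new 1x16c8 MLAL variant above differs from the plain MULL variant that follows only in its main loop: it unrolls by two 8-byte slices so each int16 product vector absorbs two widening multiplies before being folded into the int32 accumulators. The core three-instruction pattern, repeated sixteen times per iteration (once per output column), is sketched below as a hypothetical helper with illustrative names:

#include <arm_neon.h>

/* Minimal sketch of the per-column accumulation step used above:
 * widen int8 x int8 products to int16 (MULL), fold a second 8-byte slice
 * into the same products (MLAL), then pairwise-add into int32 (PADAL). */
static inline int32x4_t accumulate_c8_mlal(int32x4_t vacc,
                                           int8x8_t va0, int8x8_t va1,
                                           int8x8_t vb0, int8x8_t vb1)
{
  int16x8_t vprod = vmull_s8(vb0, va0);
  vprod = vmlal_s8(vprod, vb1, va1);
  return vpadalq_s16(vacc, vprod);
}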
diff --git a/src/qs8-gemm/gen/1x16c8-minmax-neon-mull-padal.c b/src/qs8-gemm/gen/1x16c8-minmax-neon-mull-padal.c
new file mode 100644
index 0000000..8a790ed
--- /dev/null
+++ b/src/qs8-gemm/gen/1x16c8-minmax-neon-mull-padal.c
@@ -0,0 +1,228 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/c8-neon-mull-padal.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+
+  do {
+    int32x4_t vacc0x0 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x1 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x2 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x3 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x4 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x5 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x6 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x7 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x8 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x9 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x10 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x11 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x12 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x13 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x14 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x15 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+
+    size_t k = kc;
+
+    // Handle 8 bytes at a time using MULL.
+    while (k > 0) {
+      const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+
+      const int8x8_t vb0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x0 = vmull_s8(vb0, va0);
+      vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+      const int8x8_t vb1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x1 = vmull_s8(vb1, va0);
+      vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+      const int8x8_t vb2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x2 = vmull_s8(vb2, va0);
+      vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+      const int8x8_t vb3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x3 = vmull_s8(vb3, va0);
+      vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+      const int8x8_t vb4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x4 = vmull_s8(vb4, va0);
+      vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+      const int8x8_t vb5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x5 = vmull_s8(vb5, va0);
+      vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+      const int8x8_t vb6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x6 = vmull_s8(vb6, va0);
+      vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+      const int8x8_t vb7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x7 = vmull_s8(vb7, va0);
+      vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+      const int8x8_t vb8 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x8 = vmull_s8(vb8, va0);
+      vacc0x8 = vpadalq_s16(vacc0x8, vprod0x8);
+      const int8x8_t vb9 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x9 = vmull_s8(vb9, va0);
+      vacc0x9 = vpadalq_s16(vacc0x9, vprod0x9);
+      const int8x8_t vb10 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x10 = vmull_s8(vb10, va0);
+      vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
+      const int8x8_t vb11 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x11 = vmull_s8(vb11, va0);
+      vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11);
+      const int8x8_t vb12 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x12 = vmull_s8(vb12, va0);
+      vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);
+      const int8x8_t vb13 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x13 = vmull_s8(vb13, va0);
+      vacc0x13 = vpadalq_s16(vacc0x13, vprod0x13);
+      const int8x8_t vb14 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x14 = vmull_s8(vb14, va0);
+      vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
+      const int8x8_t vb15 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x15 = vmull_s8(vb15, va0);
+      vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
+
+      k -= 8 * sizeof(int8_t);
+    }
+
+#if XNN_ARCH_ARM64
+    const int32x4_t vsum0x01 = vpaddq_s32(vacc0x0, vacc0x1);
+    const int32x4_t vsum0x23 = vpaddq_s32(vacc0x2, vacc0x3);
+    const int32x4_t vsum0x45 = vpaddq_s32(vacc0x4, vacc0x5);
+    const int32x4_t vsum0x67 = vpaddq_s32(vacc0x6, vacc0x7);
+    const int32x4_t vsum0x89 = vpaddq_s32(vacc0x8, vacc0x9);
+    const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11);
+    const int32x4_t vsum0xCD = vpaddq_s32(vacc0x12, vacc0x13);
+    const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
+    int32x4_t vacc0x0123 = vpaddq_s32(vsum0x01, vsum0x23);
+    int32x4_t vacc0x4567 = vpaddq_s32(vsum0x45, vsum0x67);
+    int32x4_t vacc0x89AB = vpaddq_s32(vsum0x89, vsum0xAB);
+    int32x4_t vacc0xCDEF = vpaddq_s32(vsum0xCD, vsum0xEF);
+#else
+    const int32x2_t vpsum0x0 = vadd_s32(vget_low_s32(vacc0x0), vget_high_s32(vacc0x0));
+    const int32x2_t vpsum0x1 = vadd_s32(vget_low_s32(vacc0x1), vget_high_s32(vacc0x1));
+    const int32x2_t vpsum0x2 = vadd_s32(vget_low_s32(vacc0x2), vget_high_s32(vacc0x2));
+    const int32x2_t vpsum0x3 = vadd_s32(vget_low_s32(vacc0x3), vget_high_s32(vacc0x3));
+    const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1);
+    const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3);
+    int32x4_t vacc0x0123 = vcombine_s32(vsum0x01, vsum0x23 );
+    const int32x2_t vpsum0x4 = vadd_s32(vget_low_s32(vacc0x4), vget_high_s32(vacc0x4));
+    const int32x2_t vpsum0x5 = vadd_s32(vget_low_s32(vacc0x5), vget_high_s32(vacc0x5));
+    const int32x2_t vpsum0x6 = vadd_s32(vget_low_s32(vacc0x6), vget_high_s32(vacc0x6));
+    const int32x2_t vpsum0x7 = vadd_s32(vget_low_s32(vacc0x7), vget_high_s32(vacc0x7));
+    const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5);
+    const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7);
+    int32x4_t vacc0x4567 = vcombine_s32(vsum0x45, vsum0x67 );
+    const int32x2_t vpsum0x8 = vadd_s32(vget_low_s32(vacc0x8), vget_high_s32(vacc0x8));
+    const int32x2_t vpsum0x9 = vadd_s32(vget_low_s32(vacc0x9), vget_high_s32(vacc0x9));
+    const int32x2_t vpsum0xA = vadd_s32(vget_low_s32(vacc0x10), vget_high_s32(vacc0x10));
+    const int32x2_t vpsum0xB = vadd_s32(vget_low_s32(vacc0x11), vget_high_s32(vacc0x11));
+    const int32x2_t vsum0x89 = vpadd_s32(vpsum0x8, vpsum0x9);
+    const int32x2_t vsum0xAB = vpadd_s32(vpsum0xA, vpsum0xB);
+    int32x4_t vacc0x89AB = vcombine_s32(vsum0x89, vsum0xAB );
+    const int32x2_t vpsum0xC = vadd_s32(vget_low_s32(vacc0x12), vget_high_s32(vacc0x12));
+    const int32x2_t vpsum0xD = vadd_s32(vget_low_s32(vacc0x13), vget_high_s32(vacc0x13));
+    const int32x2_t vpsum0xE = vadd_s32(vget_low_s32(vacc0x14), vget_high_s32(vacc0x14));
+    const int32x2_t vpsum0xF = vadd_s32(vget_low_s32(vacc0x15), vget_high_s32(vacc0x15));
+    const int32x2_t vsum0xCD = vpadd_s32(vpsum0xC, vpsum0xD);
+    const int32x2_t vsum0xEF = vpadd_s32(vpsum0xE, vpsum0xF);
+    int32x4_t vacc0xCDEF = vcombine_s32(vsum0xCD, vsum0xEF );
+#endif
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+    vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+    vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+    int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+
+    vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+
+    if (nc >= 16) {
+      vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
+      nc -= 16;
+    } else {
+      int8x8_t vout0x01234567 = vget_low_s8(vout0x0123456789ABCDEF);
+      if (nc & 8) {
+        vst1_s8(c0, vout0x01234567); c0 += 8;
+        vout0x01234567 = vget_high_s8(vout0x0123456789ABCDEF);
+      }
+      if (nc & 4) {
+        vst1_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpret_u32_s8(vout0x01234567), 0); c0 += 4;
+        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpret_u16_s8(vout0x01234567), 0); c0 += 2;
+        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1_lane_s8(c0, vout0x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
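
Because each of the sixteen accumulators in the kernel above holds partial sums for a single output column spread across four lanes, the post-loop code has to reduce them horizontally into the four vectors the requantization path expects. On AArch64 this is a tree of VPADDQ instructions; the AArch32 fallback splits each vector into halves first. A sketch of the AArch64 reduction for one group of four columns (hypothetical helper, AArch64 only):

#include <arm_neon.h>

#if defined(__aarch64__)
/* Reduce four per-column accumulators (each spread across 4 lanes) into
 * one vector holding the four column totals, matching the
 * #if XNN_ARCH_ARM64 branch above. */
static inline int32x4_t reduce_four_columns(int32x4_t vacc0, int32x4_t vacc1,
                                            int32x4_t vacc2, int32x4_t vacc3)
{
  const int32x4_t vsum01 = vpaddq_s32(vacc0, vacc1);  /* pairwise add lanes */
  const int32x4_t vsum23 = vpaddq_s32(vacc2, vacc3);
  return vpaddq_s32(vsum01, vsum23);                  /* one total per column */
}
#endif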
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-sse2-ld128.c b/src/qs8-gemm/gen/1x4c2-minmax-sse2-ld128.c
index 36d08bb..f0c02e1 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-sse2-ld128.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-sse2-ld128.c
@@ -12,6 +12,7 @@
 #include <emmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_1x4c2__sse2_ld128(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -99,15 +101,6 @@
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-          if (k > 6 * sizeof(int8_t)) {
-            const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-            const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
-            w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-            vacc0x0123 = _mm_add_epi32(vacc0x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-          }
         }
       }
     }
@@ -163,10 +156,10 @@
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
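
The same two changes repeat across every 1x4c2 SSE2/SSSE3/SSE4.1/XOP variant below: the `k > 6 * sizeof(int8_t)` tail block disappears, and the a0 rewind moves after the c0 advance inside the `nc >= 4` branch. The removal is safe because `kc = round_up_po2(kc, 2)` makes kc even, so after the 8-byte main loop the remainder is at most 6 bytes and the vxb3 path can never be reached. A small standalone check of that bound (not kernel code):

#include <assert.h>
#include <stddef.h>

/* With kc rounded up to a multiple of 2, the remainder after the 8-byte
 * main loop is kc % 8, which is always even and therefore at most 6 --
 * the removed k > 6 branch (the vxb3 load) is dead code. */
static void check_c2_remainder_bound(void) {
  for (size_t kc = 2; kc <= 64; kc += 2) {
    assert(kc % 8 <= 6);
  }
}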
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-sse2-ld64.c b/src/qs8-gemm/gen/1x4c2-minmax-sse2-ld64.c
index 96aae54..1b28605 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-sse2-ld64.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-sse2-ld64.c
@@ -12,6 +12,7 @@
 #include <emmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_1x4c2__sse2_ld64(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -99,15 +101,6 @@
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-          if (k > 6 * sizeof(int8_t)) {
-            const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-            const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
-            w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-            vacc0x0123 = _mm_add_epi32(vacc0x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-          }
         }
       }
     }
@@ -163,10 +156,10 @@
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-sse41-ld128.c b/src/qs8-gemm/gen/1x4c2-minmax-sse41-ld128.c
index 45fee98..11eafd3 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-sse41-ld128.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-sse41-ld128.c
@@ -12,6 +12,7 @@
 #include <smmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -99,15 +101,6 @@
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-          if (k > 6 * sizeof(int8_t)) {
-            const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-            const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
-            w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-            vacc0x0123 = _mm_add_epi32(vacc0x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-          }
         }
       }
     }
@@ -147,10 +140,10 @@
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-sse41-ld64.c b/src/qs8-gemm/gen/1x4c2-minmax-sse41-ld64.c
index de32f50..d1bcbd1 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-sse41-ld64.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-sse41-ld64.c
@@ -12,6 +12,7 @@
 #include <smmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -99,15 +101,6 @@
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-          if (k > 6 * sizeof(int8_t)) {
-            const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-            const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
-            w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-            vacc0x0123 = _mm_add_epi32(vacc0x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-          }
         }
       }
     }
@@ -147,10 +140,10 @@
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-ssse3-ld128.c b/src/qs8-gemm/gen/1x4c2-minmax-ssse3-ld128.c
index 9e517ba..bdb39b9 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-ssse3-ld128.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-ssse3-ld128.c
@@ -12,6 +12,7 @@
 #include <tmmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_1x4c2__ssse3_ld128(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -99,15 +101,6 @@
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-          if (k > 6 * sizeof(int8_t)) {
-            const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-            const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
-            w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-            vacc0x0123 = _mm_add_epi32(vacc0x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-          }
         }
       }
     }
@@ -163,10 +156,10 @@
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-ssse3-ld64.c b/src/qs8-gemm/gen/1x4c2-minmax-ssse3-ld64.c
index 7529af6..11f165f 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-ssse3-ld64.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-ssse3-ld64.c
@@ -12,6 +12,7 @@
 #include <tmmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_1x4c2__ssse3_ld64(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -99,15 +101,6 @@
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-          if (k > 6 * sizeof(int8_t)) {
-            const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-            const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
-            w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-            vacc0x0123 = _mm_add_epi32(vacc0x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-          }
         }
       }
     }
@@ -163,10 +156,10 @@
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-xop-ld128.c b/src/qs8-gemm/gen/1x4c2-minmax-xop-ld128.c
index 464f22f..7473e30 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-xop-ld128.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-xop-ld128.c
@@ -17,6 +17,7 @@
 #endif
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128(
@@ -40,6 +41,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -104,15 +106,6 @@
 
           vacc0x0123 = _mm_maddd_epi16(
             _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
-
-          if (k > 6 * sizeof(int8_t)) {
-            const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-            const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
-            w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-            vacc0x0123 = _mm_maddd_epi16(
-              _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc0x0123);
-          }
         }
       }
     }
@@ -152,10 +145,10 @@
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-xop-ld64.c b/src/qs8-gemm/gen/1x4c2-minmax-xop-ld64.c
index 8038248..ece3f81 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-xop-ld64.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-xop-ld64.c
@@ -17,6 +17,7 @@
 #endif
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64(
@@ -40,6 +41,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -104,15 +106,6 @@
 
           vacc0x0123 = _mm_maddd_epi16(
             _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
-
-          if (k > 6 * sizeof(int8_t)) {
-            const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-            const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
-            w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-            vacc0x0123 = _mm_maddd_epi16(
-              _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc0x0123);
-          }
         }
       }
     }
@@ -152,10 +145,10 @@
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/1x4c2-xw-minmax-sse2.c b/src/qs8-gemm/gen/1x4c2-xw-minmax-sse2.c
index 70c7dd0..6c6dd0d 100644
--- a/src/qs8-gemm/gen/1x4c2-xw-minmax-sse2.c
+++ b/src/qs8-gemm/gen/1x4c2-xw-minmax-sse2.c
@@ -12,6 +12,7 @@
 #include <emmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse2(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -92,14 +94,6 @@
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-          if (k > 6 * sizeof(int8_t)) {
-            const __m128i vxb3 = _mm_load_si128((const __m128i*) w);
-            w = (const void*) ((uintptr_t) w + 8 * sizeof(int16_t));
-
-            vacc0x0123 = _mm_add_epi32(vacc0x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-          }
         }
       }
     }
@@ -155,10 +149,10 @@
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/1x4c2-xw-minmax-sse41.c b/src/qs8-gemm/gen/1x4c2-xw-minmax-sse41.c
index d5fa2c3..2783338 100644
--- a/src/qs8-gemm/gen/1x4c2-xw-minmax-sse41.c
+++ b/src/qs8-gemm/gen/1x4c2-xw-minmax-sse41.c
@@ -12,6 +12,7 @@
 #include <smmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -92,14 +94,6 @@
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-          if (k > 6 * sizeof(int8_t)) {
-            const __m128i vxb3 = _mm_load_si128((const __m128i*) w);
-            w = (const void*) ((uintptr_t) w + 8 * sizeof(int16_t));
-
-            vacc0x0123 = _mm_add_epi32(vacc0x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-          }
         }
       }
     }
@@ -139,10 +133,10 @@
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/1x4c2-xw-minmax-ssse3.c b/src/qs8-gemm/gen/1x4c2-xw-minmax-ssse3.c
index 40b3ea8..6019e9c 100644
--- a/src/qs8-gemm/gen/1x4c2-xw-minmax-ssse3.c
+++ b/src/qs8-gemm/gen/1x4c2-xw-minmax-ssse3.c
@@ -12,6 +12,7 @@
 #include <tmmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__ssse3(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -92,14 +94,6 @@
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-          if (k > 6 * sizeof(int8_t)) {
-            const __m128i vxb3 = _mm_load_si128((const __m128i*) w);
-            w = (const void*) ((uintptr_t) w + 8 * sizeof(int16_t));
-
-            vacc0x0123 = _mm_add_epi32(vacc0x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-          }
         }
       }
     }
@@ -155,10 +149,10 @@
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/1x4c2-xw-minmax-xop.c b/src/qs8-gemm/gen/1x4c2-xw-minmax-xop.c
index 624ac2b..b4a2490 100644
--- a/src/qs8-gemm/gen/1x4c2-xw-minmax-xop.c
+++ b/src/qs8-gemm/gen/1x4c2-xw-minmax-xop.c
@@ -17,6 +17,7 @@
 #endif
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop(
@@ -40,6 +41,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -97,14 +99,6 @@
 
           vacc0x0123 = _mm_maddd_epi16(
             _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
-
-          if (k > 6 * sizeof(int8_t)) {
-            const __m128i vxb3 = _mm_load_si128((const __m128i*) w);
-            w = (const void*) ((uintptr_t) w + 8 * sizeof(int16_t));
-
-            vacc0x0123 = _mm_maddd_epi16(
-              _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc0x0123);
-          }
         }
       }
     }
@@ -144,10 +138,10 @@
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
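
Note: the three 1x4c2 `xw` kernels above share the same two changes: `kc` is rounded up to a multiple of 2 before the pointers are set up, and the `k > 6 * sizeof(int8_t)` step in the remainder handling is dropped, since an even remainder can never exceed 6 there. The 1x4c8 kernels below repeat the pattern with a rounding of 8, and their final rewind of `a0` now uses the rounded `kc` instead of the old loop counter, so it matches the distance the k loop advanced the pointer. A minimal sketch of the rounding helper, assuming the usual power-of-two bit trick (the actual definition lives in <xnnpack/math.h>; only the name round_up_po2 is taken from these diffs):

    #include <stddef.h>

    // Sketch of a round-up helper equivalent in spirit to the round_up_po2()
    // calls added above; q must be a power of two.
    static inline size_t round_up_po2_sketch(size_t n, size_t q) {
      return (n + q - 1) & ~(q - 1);
    }

    // Example: round_up_po2_sketch(10, 8) == 16 and round_up_po2_sketch(10, 2) == 10,
    // so rewinding a0 by the rounded kc undoes exactly what the k loop consumed.
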
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-sse2-ld128.c b/src/qs8-gemm/gen/1x4c8-minmax-sse2-ld128.c
index 87d81ff..135dcdb 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-sse2-ld128.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-sse2-ld128.c
@@ -12,6 +12,7 @@
 #include <emmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_1x4c8__sse2_ld128(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -126,10 +128,10 @@
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-sse2-ld64.c b/src/qs8-gemm/gen/1x4c8-minmax-sse2-ld64.c
index 3467c1e..15bd4fd 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-sse2-ld64.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-sse2-ld64.c
@@ -12,6 +12,7 @@
 #include <emmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_1x4c8__sse2_ld64(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -128,10 +130,10 @@
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-sse41-ld128.c b/src/qs8-gemm/gen/1x4c8-minmax-sse41-ld128.c
index e7b623c..df8bf83 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-sse41-ld128.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-sse41-ld128.c
@@ -12,6 +12,7 @@
 #include <smmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -110,10 +112,10 @@
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-sse41-ld64.c b/src/qs8-gemm/gen/1x4c8-minmax-sse41-ld64.c
index 6db42e2..f2e0e3a 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-sse41-ld64.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-sse41-ld64.c
@@ -12,6 +12,7 @@
 #include <smmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -112,10 +114,10 @@
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-ssse3-ld128.c b/src/qs8-gemm/gen/1x4c8-minmax-ssse3-ld128.c
index d0323dd..5f90494 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-ssse3-ld128.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-ssse3-ld128.c
@@ -12,6 +12,7 @@
 #include <tmmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_1x4c8__ssse3_ld128(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -126,10 +128,10 @@
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-ssse3-ld64.c b/src/qs8-gemm/gen/1x4c8-minmax-ssse3-ld64.c
index c8cd8ad..7031314 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-ssse3-ld64.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-ssse3-ld64.c
@@ -12,6 +12,7 @@
 #include <tmmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_1x4c8__ssse3_ld64(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -128,10 +130,10 @@
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-wasmsimd-ld128.c b/src/qs8-gemm/gen/1x4c8-minmax-wasmsimd-ld128.c
index ba3e4f1..e14c5d1 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-wasmsimd-ld128.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-wasmsimd-ld128.c
@@ -12,6 +12,7 @@
 #include <wasm_simd128.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_1x4c8__wasmsimd_ld128(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -119,10 +121,10 @@
     if (nc >= 4) {
       *((float*) c0) = (float) wasm_f32x4_extract_lane(vout, 0);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-wasmsimd-ld64.c b/src/qs8-gemm/gen/1x4c8-minmax-wasmsimd-ld64.c
index 428426a..3ea0022 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-wasmsimd-ld64.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-wasmsimd-ld64.c
@@ -12,6 +12,7 @@
 #include <wasm_simd128.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_1x4c8__wasmsimd_ld64(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -115,10 +117,10 @@
     if (nc >= 4) {
       *((float*) c0) = (float) wasm_f32x4_extract_lane(vout, 0);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-xop-ld128.c b/src/qs8-gemm/gen/1x4c8-minmax-xop-ld128.c
index e89b0a2..5411da3 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-xop-ld128.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-xop-ld128.c
@@ -17,6 +17,7 @@
 #endif
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128(
@@ -40,6 +41,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -115,10 +117,10 @@
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-xop-ld64.c b/src/qs8-gemm/gen/1x4c8-minmax-xop-ld64.c
index f281502..6bab96c 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-xop-ld64.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-xop-ld64.c
@@ -17,6 +17,7 @@
 #endif
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64(
@@ -40,6 +41,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -117,10 +119,10 @@
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/1x4c8-xw-minmax-sse2.c b/src/qs8-gemm/gen/1x4c8-xw-minmax-sse2.c
index 84cd542..4d50667 100644
--- a/src/qs8-gemm/gen/1x4c8-xw-minmax-sse2.c
+++ b/src/qs8-gemm/gen/1x4c8-xw-minmax-sse2.c
@@ -12,6 +12,7 @@
 #include <emmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse2(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -124,10 +126,10 @@
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/1x4c8-xw-minmax-sse41.c b/src/qs8-gemm/gen/1x4c8-xw-minmax-sse41.c
index 59b547c..3dc5959 100644
--- a/src/qs8-gemm/gen/1x4c8-xw-minmax-sse41.c
+++ b/src/qs8-gemm/gen/1x4c8-xw-minmax-sse41.c
@@ -12,6 +12,7 @@
 #include <smmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -108,10 +110,10 @@
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/1x4c8-xw-minmax-ssse3.c b/src/qs8-gemm/gen/1x4c8-xw-minmax-ssse3.c
index 6e1f9c9..31d4fc5 100644
--- a/src/qs8-gemm/gen/1x4c8-xw-minmax-ssse3.c
+++ b/src/qs8-gemm/gen/1x4c8-xw-minmax-ssse3.c
@@ -12,6 +12,7 @@
 #include <tmmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__ssse3(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -124,10 +126,10 @@
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/1x4c8-xw-minmax-wasmsimd.c b/src/qs8-gemm/gen/1x4c8-xw-minmax-wasmsimd.c
index 7cc7643..ffb826b 100644
--- a/src/qs8-gemm/gen/1x4c8-xw-minmax-wasmsimd.c
+++ b/src/qs8-gemm/gen/1x4c8-xw-minmax-wasmsimd.c
@@ -12,6 +12,7 @@
 #include <wasm_simd128.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__wasmsimd(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -115,10 +117,10 @@
     if (nc >= 4) {
       *((float*) c0) = (float) wasm_f32x4_extract_lane(vout, 0);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/1x4c8-xw-minmax-xop.c b/src/qs8-gemm/gen/1x4c8-xw-minmax-xop.c
index 29349f9..d4f4020 100644
--- a/src/qs8-gemm/gen/1x4c8-xw-minmax-xop.c
+++ b/src/qs8-gemm/gen/1x4c8-xw-minmax-xop.c
@@ -17,6 +17,7 @@
 #endif
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop(
@@ -40,6 +41,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -113,10 +115,10 @@
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/1x8-minmax-neon-mull-addw-dup.c b/src/qs8-gemm/gen/1x8-minmax-neon-mull-addw-dup.c
new file mode 100644
index 0000000..2e98ffc
--- /dev/null
+++ b/src/qs8-gemm/gen/1x8-minmax-neon-mull-addw-dup.c
@@ -0,0 +1,203 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/neon-mull-addw-dup.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/gemm.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_1x8__neon_mull_addw_dup(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+
+  do {
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
+      const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+
+      const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va0, 0));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c0));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c0));
+      const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va0, 1));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c1));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c1));
+      const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va0, 2));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c2));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c2));
+      const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));
+      const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va0, 4));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c4));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c4));
+      const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va0, 5));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c5));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c5));
+      const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va0, 6));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c6));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c6));
+      const int8x8_t vb01234567c7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c7 = vmull_s8(vb01234567c7, vdup_lane_s8(va0, 7));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c7));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c7));
+
+      k -= 8 * sizeof(int8_t);
+    }
+    if XNN_UNLIKELY(k != 0) {
+      const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+
+      const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va0, 0));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c0));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c0));
+
+      if (k >= 2 * sizeof(int8_t)) {
+        const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va0, 1));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c1));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c1));
+
+        if (k > 2 * sizeof(int8_t)) {
+          const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          const int16x8_t vprod0x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va0, 2));
+          vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c2));
+          vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c2));
+
+          if (k >= 4 * sizeof(int8_t)) {
+            const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+            const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));
+            vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
+            vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));
+
+            if (k > 4 * sizeof(int8_t)) {
+              const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+              const int16x8_t vprod0x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va0, 4));
+              vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c4));
+              vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c4));
+
+              if (k >= 6 * sizeof(int8_t)) {
+                const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+                const int16x8_t vprod0x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va0, 5));
+                vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c5));
+                vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c5));
+
+                if (k > 6 * sizeof(int8_t)) {
+                  const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+                  const int16x8_t vprod0x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va0, 6));
+                  vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c6));
+                  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c6));
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+
+    int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+
+    int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567);
+#endif
+    const int8x8_t voutput_min = vld1_dup_s8(&params->neon.output_min);
+    const int8x8_t voutput_max = vld1_dup_s8(&params->neon.output_max);
+
+    vout0x01234567 = vmax_s8(vout0x01234567, voutput_min);
+
+    vout0x01234567 = vmin_s8(vout0x01234567, voutput_max);
+
+    if (nc >= 8) {
+      vst1_s8(c0 + 0, vout0x01234567);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        vst1_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpret_u32_s8(vout0x01234567), 0); c0 += 4;
+        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpret_u16_s8(vout0x01234567), 0); c0 += 2;
+        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1_lane_s8(c0, vout0x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
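
Note: the new 1x8 `mull-addw-dup` kernel above keeps one pair of int32x4 accumulators for the row and widens twice per step: `vmull_s8` turns eight int8 products into int16, and `vaddw_s16` folds each half into 32-bit lanes. A minimal sketch of one column step under those assumptions (not the shipped kernel, which also covers activation lanes 1..7 and the kc remainder):

    #include <arm_neon.h>

    // Sketch of one k-step of the mull-addw-dup scheme for lane 0 only:
    // widen int8 products to int16 with vmull_s8, then widen-and-add each
    // half into the two int32x4 accumulators with vaddw_s16.
    static inline void qs8_addw_step(int32x4_t acc[2], int8x8_t vb, int8x8_t va) {
      const int16x8_t vprod = vmull_s8(vb, vdup_lane_s8(va, 0));
      acc[0] = vaddw_s16(acc[0], vget_low_s16(vprod));
      acc[1] = vaddw_s16(acc[1], vget_high_s16(vprod));
    }
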
diff --git a/src/qs8-gemm/gen/1x8c16-minmax-neon-mlal-padal.c b/src/qs8-gemm/gen/1x8c16-minmax-neon-mlal-padal.c
new file mode 100644
index 0000000..5bf0eca
--- /dev/null
+++ b/src/qs8-gemm/gen/1x8c16-minmax-neon-mlal-padal.c
@@ -0,0 +1,171 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/c16-neon-mlal-padal.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_1x8c16__neon_mlal_padal(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 16);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+
+  do {
+    int32x4_t vacc0x0 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x1 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x2 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x3 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x4 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x5 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x6 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x7 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+
+    // KC loop of 16
+    size_t k = 0;
+    while (k < kc) {
+      const int8x16_t va0 = vld1q_s8(a0); a0 += 16;
+
+      const int8x16_t vb0 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb1 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb2 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb3 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb4 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb5 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb6 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb7 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+
+      int16x8_t vprod0x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va0));
+      vprod0x0 = vmlal_s8(vprod0x0, vget_high_s8(vb0), vget_high_s8(va0));
+      vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+      int16x8_t vprod0x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va0));
+      vprod0x1 = vmlal_s8(vprod0x1, vget_high_s8(vb1), vget_high_s8(va0));
+      vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+      int16x8_t vprod0x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va0));
+      vprod0x2 = vmlal_s8(vprod0x2, vget_high_s8(vb2), vget_high_s8(va0));
+      vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+      int16x8_t vprod0x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va0));
+      vprod0x3 = vmlal_s8(vprod0x3, vget_high_s8(vb3), vget_high_s8(va0));
+      vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+      int16x8_t vprod0x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va0));
+      vprod0x4 = vmlal_s8(vprod0x4, vget_high_s8(vb4), vget_high_s8(va0));
+      vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+      int16x8_t vprod0x5 = vmull_s8(vget_low_s8(vb5), vget_low_s8(va0));
+      vprod0x5 = vmlal_s8(vprod0x5, vget_high_s8(vb5), vget_high_s8(va0));
+      vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+      int16x8_t vprod0x6 = vmull_s8(vget_low_s8(vb6), vget_low_s8(va0));
+      vprod0x6 = vmlal_s8(vprod0x6, vget_high_s8(vb6), vget_high_s8(va0));
+      vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+      int16x8_t vprod0x7 = vmull_s8(vget_low_s8(vb7), vget_low_s8(va0));
+      vprod0x7 = vmlal_s8(vprod0x7, vget_high_s8(vb7), vget_high_s8(va0));
+      vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+
+      k += 16 * sizeof(int8_t);
+    }
+
+#if XNN_ARCH_ARM64
+    const int32x4_t vsum0x01 = vpaddq_s32(vacc0x0, vacc0x1);
+    const int32x4_t vsum0x23 = vpaddq_s32(vacc0x2, vacc0x3);
+    const int32x4_t vsum0x45 = vpaddq_s32(vacc0x4, vacc0x5);
+    const int32x4_t vsum0x67 = vpaddq_s32(vacc0x6, vacc0x7);
+    int32x4_t vacc0x0123 = vpaddq_s32(vsum0x01, vsum0x23);
+    int32x4_t vacc0x4567 = vpaddq_s32(vsum0x45, vsum0x67);
+#else
+    const int32x2_t vpsum0x0 = vadd_s32(vget_low_s32(vacc0x0), vget_high_s32(vacc0x0));
+    const int32x2_t vpsum0x1 = vadd_s32(vget_low_s32(vacc0x1), vget_high_s32(vacc0x1));
+    const int32x2_t vpsum0x2 = vadd_s32(vget_low_s32(vacc0x2), vget_high_s32(vacc0x2));
+    const int32x2_t vpsum0x3 = vadd_s32(vget_low_s32(vacc0x3), vget_high_s32(vacc0x3));
+    const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1);
+    const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3);
+    int32x4_t vacc0x0123 = vcombine_s32(vsum0x01, vsum0x23 );
+    const int32x2_t vpsum0x4 = vadd_s32(vget_low_s32(vacc0x4), vget_high_s32(vacc0x4));
+    const int32x2_t vpsum0x5 = vadd_s32(vget_low_s32(vacc0x5), vget_high_s32(vacc0x5));
+    const int32x2_t vpsum0x6 = vadd_s32(vget_low_s32(vacc0x6), vget_high_s32(vacc0x6));
+    const int32x2_t vpsum0x7 = vadd_s32(vget_low_s32(vacc0x7), vget_high_s32(vacc0x7));
+    const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5);
+    const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7);
+    int32x4_t vacc0x4567 = vcombine_s32(vsum0x45, vsum0x67 );
+#endif
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+
+    int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567);
+#endif
+    const int8x8_t voutput_min = vld1_dup_s8(&params->neon.output_min);
+    const int8x8_t voutput_max = vld1_dup_s8(&params->neon.output_max);
+
+    vout0x01234567 = vmax_s8(vout0x01234567, voutput_min);
+
+    vout0x01234567 = vmin_s8(vout0x01234567, voutput_max);
+
+    if (nc >= 8) {
+      vst1_s8(c0 + 0, vout0x01234567);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        vst1_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpret_u32_s8(vout0x01234567), 0); c0 += 4;
+        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpret_u16_s8(vout0x01234567), 0); c0 += 2;
+        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1_lane_s8(c0, vout0x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
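
Note: the new 1x8c16 kernel above differs from the 1x8 variant in that it keeps one int32x4 accumulator per output column and reduces them horizontally only after the k loop (vpaddq_s32 on AArch64, vpadd_s32 pairs otherwise). The per-column inner step is a widening multiply-accumulate followed by a pairwise add into 32 bits; a minimal sketch of that step, under the same assumptions:

    #include <arm_neon.h>

    // Sketch of the c16 "mlal-padal" inner step for one output column:
    // 16 int8 products are formed in two halves (vmull_s8 + vmlal_s8) and the
    // eight int16 partial sums are pairwise-accumulated into int32 lanes.
    static inline int32x4_t qs8_c16_column_step(int32x4_t acc, int8x16_t va, int8x16_t vb) {
      int16x8_t prod = vmull_s8(vget_low_s8(vb), vget_low_s8(va));
      prod = vmlal_s8(prod, vget_high_s8(vb), vget_high_s8(va));
      return vpadalq_s16(acc, prod);
    }
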
diff --git a/src/qs8-gemm/gen/1x8c2-minmax-neon-mlal-padal-dup.c b/src/qs8-gemm/gen/1x8c2-minmax-neon-mlal-padal-dup.c
new file mode 100644
index 0000000..378b1b4
--- /dev/null
+++ b/src/qs8-gemm/gen/1x8c2-minmax-neon-mlal-padal-dup.c
@@ -0,0 +1,214 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/c2-neon-mull-padal-dup.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+
+  do {
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t k = kc;
+
+    while (k >= 16 * sizeof(int8_t)) {
+      const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;
+      const int8x8_t va0x1 = vld1_s8(a0); a0 += 8;
+
+      const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+      const int8x8_t vb0123c0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x0123c0 = vmlal_s8(vprod0x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+      int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+      const int8x8_t vb4567c0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x4567c0 = vmlal_s8(vprod0x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+      int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+      const int8x8_t vb0123c1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x0123c1 = vmlal_s8(vprod0x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+      int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+      const int8x8_t vb4567c1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x4567c1 = vmlal_s8(vprod0x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+      int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+      const int8x8_t vb0123c2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x0123c2 = vmlal_s8(vprod0x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+      int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+      const int8x8_t vb4567c2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x4567c2 = vmlal_s8(vprod0x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+      int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+      const int8x8_t vb0123c3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x0123c3 = vmlal_s8(vprod0x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+      int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+      const int8x8_t vb4567c3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x4567c3 = vmlal_s8(vprod0x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+
+      k -= 16 * sizeof(int8_t);
+    }
+
+    if (k >= 8 * sizeof(int8_t)) {
+      const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+
+      const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+      const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+      const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+      const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+      const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+      const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+
+      k -= 8 * sizeof(int8_t);
+    }
+
+    if XNN_UNLIKELY(k != 0) {
+      const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+
+      const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+      const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+
+      if (k > 2 * sizeof(int8_t)) {
+        const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+        const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+
+        if (k > 4 * sizeof(int8_t)) {
+          const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+          vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+          const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+          vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+        }
+      }
+    }
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+
+    int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+
+    int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567);
+#endif
+    const int8x8_t voutput_min = vld1_dup_s8(&params->neon.output_min);
+    const int8x8_t voutput_max = vld1_dup_s8(&params->neon.output_max);
+
+    vout0x01234567 = vmax_s8(vout0x01234567, voutput_min);
+
+    vout0x01234567 = vmin_s8(vout0x01234567, voutput_max);
+
+    if (nc >= 8) {
+      vst1_s8(c0 + 0, vout0x01234567);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        vst1_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpret_u32_s8(vout0x01234567), 0); c0 += 4;
+        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpret_u16_s8(vout0x01234567), 0); c0 += 2;
+        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1_lane_s8(c0, vout0x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
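
Note: the "dup" in the new 1x8c2 kernel above (and in the mull-only variant that follows) refers to broadcasting a pair of adjacent activation bytes by reinterpreting the int8x8 vector as int16x4 and duplicating one 16-bit lane; that pair then lines up with weights packed two channels (c2) at a time. A minimal sketch of just that broadcast, not the full kernel:

    #include <arm_neon.h>

    // Sketch of the c2 "dup" broadcast: lane n of the int16 view selects
    // activation bytes 2n and 2n+1, which vmull_s8 then multiplies against a
    // weights vector packed with two channels per column.
    static inline int8x8_t broadcast_activation_pair(int8x8_t va) {
      return vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va), 0));
    }
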
diff --git a/src/qs8-gemm/gen/1x8c2-minmax-neon-mull-padal-dup.c b/src/qs8-gemm/gen/1x8c2-minmax-neon-mull-padal-dup.c
new file mode 100644
index 0000000..9576c96
--- /dev/null
+++ b/src/qs8-gemm/gen/1x8c2-minmax-neon-mull-padal-dup.c
@@ -0,0 +1,166 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/c2-neon-mull-padal-dup.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mull_padal_dup(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+
+  do {
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t k = kc;
+
+
+    while (k >= 8 * sizeof(int8_t)) {
+      const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+
+      const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+      const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+      const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+      const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+      const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+      const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+
+      k -= 8 * sizeof(int8_t);
+    }
+
+    if XNN_UNLIKELY(k != 0) {
+      const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+
+      const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+      const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+
+      if (k > 2 * sizeof(int8_t)) {
+        const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+        const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+
+        if (k > 4 * sizeof(int8_t)) {
+          const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+          vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+          const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+          vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+        }
+      }
+    }
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+
+    int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+
+    int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567);
+#endif
+    const int8x8_t voutput_min = vld1_dup_s8(&params->neon.output_min);
+    const int8x8_t voutput_max = vld1_dup_s8(&params->neon.output_max);
+
+    vout0x01234567 = vmax_s8(vout0x01234567, voutput_min);
+
+    vout0x01234567 = vmin_s8(vout0x01234567, voutput_max);
+
+    if (nc >= 8) {
+      vst1_s8(c0 + 0, vout0x01234567);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        vst1_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpret_u32_s8(vout0x01234567), 0); c0 += 4;
+        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpret_u16_s8(vout0x01234567), 0); c0 += 2;
+        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1_lane_s8(c0, vout0x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-gemm/gen/1x8c4-minmax-neondot.c b/src/qs8-gemm/gen/1x8c4-minmax-neondot.c
index 3ce1f0d..c6925c1 100644
--- a/src/qs8-gemm/gen/1x8c4-minmax-neondot.c
+++ b/src/qs8-gemm/gen/1x8c4-minmax-neondot.c
@@ -7,18 +7,14 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
-
 #include <assert.h>
 
 #include <arm_neon.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
-// This kernel uses ARMv8.2 dot-product instructions.
-//
-// Scalar model: xnn_qs8_gemm_minmax_ukernel_1x8c4__scalar. Refer to
-// that kernel for more comments.
 void xnn_qs8_gemm_minmax_ukernel_1x8c4__neondot(
     size_t mr,
     size_t nc,
@@ -34,7 +30,12 @@
   assert(mr <= 1);
   assert(nc != 0);
   assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
 
+  kc = round_up_po2(kc, 4);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -53,10 +54,10 @@
       const int8x8_t va0x01234567 = vld1_s8(a0); a0 += 8;
 
       // Load a 8x8 block of weights.
-      const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
+      const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+      const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+      const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+      const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
 
       // Multiply-accumulate: 1x8 * 8x8 --> 1x8.
       vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb0123x0123, va0x01234567, 0);
@@ -66,35 +67,21 @@
 
       k -= 8 * sizeof(int8_t);
     }
-    // Handle up to 7 final positions of `k`
+    // Handle up to 4 final positions of `k`
     if XNN_UNLIKELY(k != 0) {
       // Load a 1x4 block of activations.
-      const int8x8_t va0x01234567 = vld1_s8(a0); a0 += k;
+      const int8x8_t va0x01234567 = vld1_s8(a0); a0 += 4;
 
       // Load a 4x8 block of weights.
-      const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
+      const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+      const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
 
       // Multiply-accumulate: 1x4 * 4x8 --> 1x8.
       vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb0123x0123, va0x01234567, 0);
       vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb0123x4567, va0x01234567, 0);
-
-      if (k > 4) {
-        // Load a 4x8 block of weights.
-        const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-
-        // Multiply-accumulate: 1x4 * 4x8 --> 1x8.
-        vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
-        vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
-      }
     }
-    // End of accumulation loop. The variable `kc` contains the amount by which
-    // we advanced the `va` pointers, so we rewind by this amount now.
-    a0 = (const int8_t*)((uintptr_t)a0 - kc);
 
     // Post-accumulation work
-
     const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
     const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
 
@@ -131,6 +118,8 @@
       // Advance to the next 8 columns.
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 8;
     } else {
       // Final case where not all of the 8 columns fit in the destination.
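
Note: the 1x8c4 neondot change follows the same scheme: `kc` is rounded up to a multiple of 4, so after the 8-byte main loop the remainder is either 0 or exactly 4 bytes (for example, kc = 10 is padded to 12: one 8-byte pass plus a single 4-byte tail), which makes the old `k > 4` second dot-product step dead code. The rewind of `a0` by `kc` correspondingly moves out of the k handling and into the `nc >= 8` branch, matching the other kernels in this change.
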
diff --git a/src/qs8-gemm/gen/1x8c8-minmax-avx2.c b/src/qs8-gemm/gen/1x8c8-minmax-avx2.c
index 29b2e86..adbc9b5 100644
--- a/src/qs8-gemm/gen/1x8c8-minmax-avx2.c
+++ b/src/qs8-gemm/gen/1x8c8-minmax-avx2.c
@@ -13,6 +13,7 @@
 
 #include <xnnpack/gemm.h>
 #include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2(
@@ -36,6 +37,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -128,10 +130,10 @@
     if (nc >= 8) {
       _mm_storel_epi64((__m128i*) c0, vout_lo);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 8;
     } else {
       if (nc & 4) {
diff --git a/src/qs8-gemm/gen/1x8c8-minmax-neon-mlal-padal.c b/src/qs8-gemm/gen/1x8c8-minmax-neon-mlal-padal.c
new file mode 100644
index 0000000..52524d5
--- /dev/null
+++ b/src/qs8-gemm/gen/1x8c8-minmax-neon-mlal-padal.c
@@ -0,0 +1,212 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/c8-neon-mull-padal.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+
+  do {
+    int32x4_t vacc0x0 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x1 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x2 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x3 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x4 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x5 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x6 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x7 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+
+    size_t k = kc;
+    // 2x partial unrolled loop to load 16 bytes at a time using MLA.
+    while (k >= 16 * sizeof(int8_t)) {
+      const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;
+      const int8x8_t va0x1 = vld1_s8(a0); a0 += 8;
+
+      const int8x8_t vb0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb5x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb6x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb7x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int8x8_t vb0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      int16x8_t vprod0x0 = vmull_s8(vb0x0, va0x0);
+      vprod0x0 = vmlal_s8(vprod0x0, vb0x1, va0x1);
+      vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+      const int8x8_t vb1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      int16x8_t vprod0x1 = vmull_s8(vb1x0, va0x0);
+      vprod0x1 = vmlal_s8(vprod0x1, vb1x1, va0x1);
+      vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+      const int8x8_t vb2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      int16x8_t vprod0x2 = vmull_s8(vb2x0, va0x0);
+      vprod0x2 = vmlal_s8(vprod0x2, vb2x1, va0x1);
+      vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+      const int8x8_t vb3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      int16x8_t vprod0x3 = vmull_s8(vb3x0, va0x0);
+      vprod0x3 = vmlal_s8(vprod0x3, vb3x1, va0x1);
+      vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+      const int8x8_t vb4x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      int16x8_t vprod0x4 = vmull_s8(vb4x0, va0x0);
+      vprod0x4 = vmlal_s8(vprod0x4, vb4x1, va0x1);
+      vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+      const int8x8_t vb5x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      int16x8_t vprod0x5 = vmull_s8(vb5x0, va0x0);
+      vprod0x5 = vmlal_s8(vprod0x5, vb5x1, va0x1);
+      vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+      const int8x8_t vb6x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      int16x8_t vprod0x6 = vmull_s8(vb6x0, va0x0);
+      vprod0x6 = vmlal_s8(vprod0x6, vb6x1, va0x1);
+      vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+      const int8x8_t vb7x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      int16x8_t vprod0x7 = vmull_s8(vb7x0, va0x0);
+      vprod0x7 = vmlal_s8(vprod0x7, vb7x1, va0x1);
+      vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+
+      k -= 16 * sizeof(int8_t);
+    }
+
+    // Handle 8 bytes at a time using MUL.
+    if (k > 0) {
+      const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+
+      const int8x8_t vb0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x0 = vmull_s8(vb0, va0);
+      vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+      const int8x8_t vb1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x1 = vmull_s8(vb1, va0);
+      vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+      const int8x8_t vb2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x2 = vmull_s8(vb2, va0);
+      vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+      const int8x8_t vb3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x3 = vmull_s8(vb3, va0);
+      vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+      const int8x8_t vb4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x4 = vmull_s8(vb4, va0);
+      vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+      const int8x8_t vb5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x5 = vmull_s8(vb5, va0);
+      vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+      const int8x8_t vb6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x6 = vmull_s8(vb6, va0);
+      vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+      const int8x8_t vb7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x7 = vmull_s8(vb7, va0);
+      vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+
+      k -= 8 * sizeof(int8_t);
+    }
+
+#if XNN_ARCH_ARM64
+    const int32x4_t vsum0x01 = vpaddq_s32(vacc0x0, vacc0x1);
+    const int32x4_t vsum0x23 = vpaddq_s32(vacc0x2, vacc0x3);
+    const int32x4_t vsum0x45 = vpaddq_s32(vacc0x4, vacc0x5);
+    const int32x4_t vsum0x67 = vpaddq_s32(vacc0x6, vacc0x7);
+    int32x4_t vacc0x0123 = vpaddq_s32(vsum0x01, vsum0x23);
+    int32x4_t vacc0x4567 = vpaddq_s32(vsum0x45, vsum0x67);
+#else
+    const int32x2_t vpsum0x0 = vadd_s32(vget_low_s32(vacc0x0), vget_high_s32(vacc0x0));
+    const int32x2_t vpsum0x1 = vadd_s32(vget_low_s32(vacc0x1), vget_high_s32(vacc0x1));
+    const int32x2_t vpsum0x2 = vadd_s32(vget_low_s32(vacc0x2), vget_high_s32(vacc0x2));
+    const int32x2_t vpsum0x3 = vadd_s32(vget_low_s32(vacc0x3), vget_high_s32(vacc0x3));
+    const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1);
+    const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3);
+    int32x4_t vacc0x0123 = vcombine_s32(vsum0x01, vsum0x23);
+    const int32x2_t vpsum0x4 = vadd_s32(vget_low_s32(vacc0x4), vget_high_s32(vacc0x4));
+    const int32x2_t vpsum0x5 = vadd_s32(vget_low_s32(vacc0x5), vget_high_s32(vacc0x5));
+    const int32x2_t vpsum0x6 = vadd_s32(vget_low_s32(vacc0x6), vget_high_s32(vacc0x6));
+    const int32x2_t vpsum0x7 = vadd_s32(vget_low_s32(vacc0x7), vget_high_s32(vacc0x7));
+    const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5);
+    const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7);
+    int32x4_t vacc0x4567 = vcombine_s32(vsum0x45, vsum0x67);
+#endif
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+
+    int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567);
+#endif
+    const int8x8_t voutput_min = vld1_dup_s8(&params->neon.output_min);
+    const int8x8_t voutput_max = vld1_dup_s8(&params->neon.output_max);
+
+    vout0x01234567 = vmax_s8(vout0x01234567, voutput_min);
+
+    vout0x01234567 = vmin_s8(vout0x01234567, voutput_max);
+
+    if (nc >= 8) {
+      vst1_s8(c0 + 0, vout0x01234567);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        vst1_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpret_u32_s8(vout0x01234567), 0); c0 += 4;
+        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpret_u16_s8(vout0x01234567), 0); c0 += 2;
+        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1_lane_s8(c0, vout0x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-gemm/gen/1x8c8-minmax-neon-mull-padal.c b/src/qs8-gemm/gen/1x8c8-minmax-neon-mull-padal.c
new file mode 100644
index 0000000..7a00b9a
--- /dev/null
+++ b/src/qs8-gemm/gen/1x8c8-minmax-neon-mull-padal.c
@@ -0,0 +1,163 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/c8-neon-mull-padal.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mull_padal(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+
+  do {
+    int32x4_t vacc0x0 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x1 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x2 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x3 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x4 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x5 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x6 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x7 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+
+    size_t k = kc;
+
+    // Handle 8 bytes at a time using MUL.
+    while (k > 0) {
+      const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+
+      const int8x8_t vb0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x0 = vmull_s8(vb0, va0);
+      vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+      const int8x8_t vb1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x1 = vmull_s8(vb1, va0);
+      vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+      const int8x8_t vb2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x2 = vmull_s8(vb2, va0);
+      vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+      const int8x8_t vb3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x3 = vmull_s8(vb3, va0);
+      vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+      const int8x8_t vb4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x4 = vmull_s8(vb4, va0);
+      vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+      const int8x8_t vb5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x5 = vmull_s8(vb5, va0);
+      vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+      const int8x8_t vb6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x6 = vmull_s8(vb6, va0);
+      vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+      const int8x8_t vb7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x7 = vmull_s8(vb7, va0);
+      vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+
+      k -= 8 * sizeof(int8_t);
+    }
+
+#if XNN_ARCH_ARM64
+    const int32x4_t vsum0x01 = vpaddq_s32(vacc0x0, vacc0x1);
+    const int32x4_t vsum0x23 = vpaddq_s32(vacc0x2, vacc0x3);
+    const int32x4_t vsum0x45 = vpaddq_s32(vacc0x4, vacc0x5);
+    const int32x4_t vsum0x67 = vpaddq_s32(vacc0x6, vacc0x7);
+    int32x4_t vacc0x0123 = vpaddq_s32(vsum0x01, vsum0x23);
+    int32x4_t vacc0x4567 = vpaddq_s32(vsum0x45, vsum0x67);
+#else
+    const int32x2_t vpsum0x0 = vadd_s32(vget_low_s32(vacc0x0), vget_high_s32(vacc0x0));
+    const int32x2_t vpsum0x1 = vadd_s32(vget_low_s32(vacc0x1), vget_high_s32(vacc0x1));
+    const int32x2_t vpsum0x2 = vadd_s32(vget_low_s32(vacc0x2), vget_high_s32(vacc0x2));
+    const int32x2_t vpsum0x3 = vadd_s32(vget_low_s32(vacc0x3), vget_high_s32(vacc0x3));
+    const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1);
+    const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3);
+    int32x4_t vacc0x0123 = vcombine_s32(vsum0x01, vsum0x23);
+    const int32x2_t vpsum0x4 = vadd_s32(vget_low_s32(vacc0x4), vget_high_s32(vacc0x4));
+    const int32x2_t vpsum0x5 = vadd_s32(vget_low_s32(vacc0x5), vget_high_s32(vacc0x5));
+    const int32x2_t vpsum0x6 = vadd_s32(vget_low_s32(vacc0x6), vget_high_s32(vacc0x6));
+    const int32x2_t vpsum0x7 = vadd_s32(vget_low_s32(vacc0x7), vget_high_s32(vacc0x7));
+    const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5);
+    const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7);
+    int32x4_t vacc0x4567 = vcombine_s32(vsum0x45, vsum0x67);
+#endif
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+
+    int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567);
+#endif
+    const int8x8_t voutput_min = vld1_dup_s8(&params->neon.output_min);
+    const int8x8_t voutput_max = vld1_dup_s8(&params->neon.output_max);
+
+    vout0x01234567 = vmax_s8(vout0x01234567, voutput_min);
+
+    vout0x01234567 = vmin_s8(vout0x01234567, voutput_max);
+
+    if (nc >= 8) {
+      vst1_s8(c0 + 0, vout0x01234567);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        vst1_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpret_u32_s8(vout0x01234567), 0); c0 += 4;
+        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpret_u16_s8(vout0x01234567), 0); c0 += 2;
+        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1_lane_s8(c0, vout0x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-gemm/gen/1x8c8-xw-minmax-avx2.c b/src/qs8-gemm/gen/1x8c8-xw-minmax-avx2.c
index e039738..9d51b83 100644
--- a/src/qs8-gemm/gen/1x8c8-xw-minmax-avx2.c
+++ b/src/qs8-gemm/gen/1x8c8-xw-minmax-avx2.c
@@ -13,6 +13,7 @@
 
 #include <xnnpack/gemm.h>
 #include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2(
@@ -36,6 +37,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -124,10 +126,10 @@
     if (nc >= 8) {
       _mm_storel_epi64((__m128i*) c0, vout_lo);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 8;
     } else {
       if (nc & 4) {
diff --git a/src/qs8-gemm/gen/2x16-minmax-neon-mull-addw-dup.c b/src/qs8-gemm/gen/2x16-minmax-neon-mull-addw-dup.c
new file mode 100644
index 0000000..5850903
--- /dev/null
+++ b/src/qs8-gemm/gen/2x16-minmax-neon-mull-addw-dup.c
@@ -0,0 +1,415 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/neon-mull-addw-dup.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/gemm.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+
+  do {
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc1x0123 = vacc0x0123;
+    int32x4_t vacc1x4567 = vacc0x4567;
+    int32x4_t vacc1x89AB = vacc0x89AB;
+    int32x4_t vacc1xCDEF = vacc0xCDEF;
+
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
+      const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+      const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+
+      const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va0, 0));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c0));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c0));
+      const int16x8_t vprod1x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va1, 0));
+      vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c0));
+      vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c0));
+      const int8x8_t vb89ABCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va0, 0));
+      vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc0));
+      vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc0));
+      const int16x8_t vprod1x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va1, 0));
+      vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc0));
+      vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc0));
+      const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va0, 1));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c1));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c1));
+      const int16x8_t vprod1x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va1, 1));
+      vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c1));
+      vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c1));
+      const int8x8_t vb89ABCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va0, 1));
+      vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc1));
+      vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc1));
+      const int16x8_t vprod1x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va1, 1));
+      vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc1));
+      vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc1));
+      const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va0, 2));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c2));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c2));
+      const int16x8_t vprod1x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va1, 2));
+      vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c2));
+      vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c2));
+      const int8x8_t vb89ABCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va0, 2));
+      vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc2));
+      vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc2));
+      const int16x8_t vprod1x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va1, 2));
+      vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc2));
+      vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc2));
+      const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));
+      const int16x8_t vprod1x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va1, 3));
+      vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c3));
+      vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c3));
+      const int8x8_t vb89ABCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va0, 3));
+      vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc3));
+      vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc3));
+      const int16x8_t vprod1x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va1, 3));
+      vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc3));
+      vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc3));
+      const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va0, 4));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c4));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c4));
+      const int16x8_t vprod1x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va1, 4));
+      vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c4));
+      vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c4));
+      const int8x8_t vb89ABCDEFc4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va0, 4));
+      vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc4));
+      vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc4));
+      const int16x8_t vprod1x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va1, 4));
+      vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc4));
+      vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc4));
+      const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va0, 5));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c5));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c5));
+      const int16x8_t vprod1x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va1, 5));
+      vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c5));
+      vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c5));
+      const int8x8_t vb89ABCDEFc5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va0, 5));
+      vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc5));
+      vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc5));
+      const int16x8_t vprod1x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va1, 5));
+      vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc5));
+      vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc5));
+      const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va0, 6));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c6));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c6));
+      const int16x8_t vprod1x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va1, 6));
+      vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c6));
+      vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c6));
+      const int8x8_t vb89ABCDEFc6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va0, 6));
+      vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc6));
+      vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc6));
+      const int16x8_t vprod1x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va1, 6));
+      vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc6));
+      vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc6));
+      const int8x8_t vb01234567c7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c7 = vmull_s8(vb01234567c7, vdup_lane_s8(va0, 7));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c7));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c7));
+      const int16x8_t vprod1x01234567c7 = vmull_s8(vb01234567c7, vdup_lane_s8(va1, 7));
+      vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c7));
+      vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c7));
+      const int8x8_t vb89ABCDEFc7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x89ABCDEFc7 = vmull_s8(vb89ABCDEFc7, vdup_lane_s8(va0, 7));
+      vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc7));
+      vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc7));
+      const int16x8_t vprod1x89ABCDEFc7 = vmull_s8(vb89ABCDEFc7, vdup_lane_s8(va1, 7));
+      vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc7));
+      vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc7));
+
+      k -= 8 * sizeof(int8_t);
+    }
+    if XNN_UNLIKELY(k != 0) {
+      const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+      const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
+
+      const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va0, 0));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c0));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c0));
+      const int16x8_t vprod0x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va0, 0));
+      vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc0));
+      vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc0));
+      const int16x8_t vprod1x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va1, 0));
+      vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c0));
+      vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c0));
+      const int16x8_t vprod1x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va1, 0));
+      vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc0));
+      vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc0));
+
+      if (k >= 2 * sizeof(int8_t)) {
+        const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va0, 1));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c1));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c1));
+        const int16x8_t vprod0x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va0, 1));
+        vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc1));
+        vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc1));
+        const int16x8_t vprod1x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va1, 1));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c1));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c1));
+        const int16x8_t vprod1x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va1, 1));
+        vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc1));
+        vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc1));
+
+        if (k > 2 * sizeof(int8_t)) {
+          const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb89ABCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          const int16x8_t vprod0x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va0, 2));
+          vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c2));
+          vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c2));
+          const int16x8_t vprod0x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va0, 2));
+          vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc2));
+          vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc2));
+          const int16x8_t vprod1x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va1, 2));
+          vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c2));
+          vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c2));
+          const int16x8_t vprod1x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va1, 2));
+          vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc2));
+          vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc2));
+
+          if (k >= 4 * sizeof(int8_t)) {
+            const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int8x8_t vb89ABCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+            const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));
+            vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
+            vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));
+            const int16x8_t vprod0x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va0, 3));
+            vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc3));
+            vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc3));
+            const int16x8_t vprod1x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va1, 3));
+            vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c3));
+            vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c3));
+            const int16x8_t vprod1x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va1, 3));
+            vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc3));
+            vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc3));
+
+            if (k > 4 * sizeof(int8_t)) {
+              const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+              const int8x8_t vb89ABCDEFc4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+              const int16x8_t vprod0x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va0, 4));
+              vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c4));
+              vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c4));
+              const int16x8_t vprod0x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va0, 4));
+              vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc4));
+              vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc4));
+              const int16x8_t vprod1x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va1, 4));
+              vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c4));
+              vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c4));
+              const int16x8_t vprod1x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va1, 4));
+              vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc4));
+              vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc4));
+
+              if (k >= 6 * sizeof(int8_t)) {
+                const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+                const int8x8_t vb89ABCDEFc5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+                const int16x8_t vprod0x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va0, 5));
+                vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c5));
+                vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c5));
+                const int16x8_t vprod0x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va0, 5));
+                vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc5));
+                vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc5));
+                const int16x8_t vprod1x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va1, 5));
+                vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c5));
+                vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c5));
+                const int16x8_t vprod1x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va1, 5));
+                vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc5));
+                vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc5));
+
+                if (k > 6 * sizeof(int8_t)) {
+                  const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+                  const int8x8_t vb89ABCDEFc6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+                  const int16x8_t vprod0x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va0, 6));
+                  vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c6));
+                  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c6));
+                  const int16x8_t vprod0x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va0, 6));
+                  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc6));
+                  vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc6));
+                  const int16x8_t vprod1x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va1, 6));
+                  vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c6));
+                  vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c6));
+                  const int16x8_t vprod1x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va1, 6));
+                  vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc6));
+                  vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc6));
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+    vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc1x89AB = vqrdmulhq_s32(vacc1x89AB, vmultiplier);
+    vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+    vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc1x89AB = vsraq_n_s32(vacc1x89AB, vbicq_s32(vacc1x89AB, vzero_shift_mask), 31);
+    vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc1x89AB = vrshlq_s32(vacc1x89AB, vright_shift);
+    vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+    int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+    int8x16_t vout1x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc1x01234567), vqmovn_s16(vacc1x89ABCDEF));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+    vout1x0123456789ABCDEF = vmaxq_s8(vout1x0123456789ABCDEF, voutput_min);
+
+    vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+    vout1x0123456789ABCDEF = vminq_s8(vout1x0123456789ABCDEF, voutput_max);
+
+    if (nc >= 16) {
+      vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+      vst1q_s8(c1 + 0, vout1x0123456789ABCDEF);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
+      nc -= 16;
+    } else {
+      int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8(vout1x0123456789ABCDEF));
+      if (nc & 8) {
+        vst1_s8(c0, vget_low_s8(vout0x01234567_1x01234567)); c0 += 8;
+        vst1_s8(c1, vget_high_s8(vout0x01234567_1x01234567)); c1 += 8;
+        vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0123456789ABCDEF));
+      }
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-gemm/gen/2x16c16-minmax-neon-mlal-padal.c b/src/qs8-gemm/gen/2x16c16-minmax-neon-mlal-padal.c
new file mode 100644
index 0000000..bab89e5
--- /dev/null
+++ b/src/qs8-gemm/gen/2x16c16-minmax-neon-mlal-padal.c
@@ -0,0 +1,382 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/c16-neon-mlal-padal.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 16);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+
+  do {
+    int32x4_t vacc0x0 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x1 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x2 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x3 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x4 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x5 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x6 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x7 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x8 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x9 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x10 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x11 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x12 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x13 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x14 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x15 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc1x0 = vacc0x0;
+    int32x4_t vacc1x1 = vacc0x1;
+    int32x4_t vacc1x2 = vacc0x2;
+    int32x4_t vacc1x3 = vacc0x3;
+    int32x4_t vacc1x4 = vacc0x4;
+    int32x4_t vacc1x5 = vacc0x5;
+    int32x4_t vacc1x6 = vacc0x6;
+    int32x4_t vacc1x7 = vacc0x7;
+    int32x4_t vacc1x8 = vacc0x8;
+    int32x4_t vacc1x9 = vacc0x9;
+    int32x4_t vacc1x10 = vacc0x10;
+    int32x4_t vacc1x11 = vacc0x11;
+    int32x4_t vacc1x12 = vacc0x12;
+    int32x4_t vacc1x13 = vacc0x13;
+    int32x4_t vacc1x14 = vacc0x14;
+    int32x4_t vacc1x15 = vacc0x15;
+
+    // KC loop of 16
+    size_t k = 0;
+    while (k < kc) {
+      const int8x16_t va0 = vld1q_s8(a0); a0 += 16;
+      const int8x16_t va1 = vld1q_s8(a1); a1 += 16;
+
+      const int8x16_t vb0 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb1 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb2 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb3 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb4 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb5 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb6 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb7 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb8 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb9 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb10 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb11 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb12 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb13 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb14 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb15 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+
+      int16x8_t vprod0x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va0));
+      int16x8_t vprod1x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va1));
+      vprod0x0 = vmlal_s8(vprod0x0, vget_high_s8(vb0), vget_high_s8(va0));
+      vprod1x0 = vmlal_s8(vprod1x0, vget_high_s8(vb0), vget_high_s8(va1));
+      vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+      vacc1x0 = vpadalq_s16(vacc1x0, vprod1x0);
+      int16x8_t vprod0x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va0));
+      int16x8_t vprod1x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va1));
+      vprod0x1 = vmlal_s8(vprod0x1, vget_high_s8(vb1), vget_high_s8(va0));
+      vprod1x1 = vmlal_s8(vprod1x1, vget_high_s8(vb1), vget_high_s8(va1));
+      vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+      vacc1x1 = vpadalq_s16(vacc1x1, vprod1x1);
+      int16x8_t vprod0x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va0));
+      int16x8_t vprod1x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va1));
+      vprod0x2 = vmlal_s8(vprod0x2, vget_high_s8(vb2), vget_high_s8(va0));
+      vprod1x2 = vmlal_s8(vprod1x2, vget_high_s8(vb2), vget_high_s8(va1));
+      vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+      vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);
+      int16x8_t vprod0x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va0));
+      int16x8_t vprod1x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va1));
+      vprod0x3 = vmlal_s8(vprod0x3, vget_high_s8(vb3), vget_high_s8(va0));
+      vprod1x3 = vmlal_s8(vprod1x3, vget_high_s8(vb3), vget_high_s8(va1));
+      vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+      vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);
+      int16x8_t vprod0x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va0));
+      int16x8_t vprod1x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va1));
+      vprod0x4 = vmlal_s8(vprod0x4, vget_high_s8(vb4), vget_high_s8(va0));
+      vprod1x4 = vmlal_s8(vprod1x4, vget_high_s8(vb4), vget_high_s8(va1));
+      vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+      vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);
+      int16x8_t vprod0x5 = vmull_s8(vget_low_s8(vb5), vget_low_s8(va0));
+      int16x8_t vprod1x5 = vmull_s8(vget_low_s8(vb5), vget_low_s8(va1));
+      vprod0x5 = vmlal_s8(vprod0x5, vget_high_s8(vb5), vget_high_s8(va0));
+      vprod1x5 = vmlal_s8(vprod1x5, vget_high_s8(vb5), vget_high_s8(va1));
+      vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+      vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);
+      int16x8_t vprod0x6 = vmull_s8(vget_low_s8(vb6), vget_low_s8(va0));
+      int16x8_t vprod1x6 = vmull_s8(vget_low_s8(vb6), vget_low_s8(va1));
+      vprod0x6 = vmlal_s8(vprod0x6, vget_high_s8(vb6), vget_high_s8(va0));
+      vprod1x6 = vmlal_s8(vprod1x6, vget_high_s8(vb6), vget_high_s8(va1));
+      vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+      vacc1x6 = vpadalq_s16(vacc1x6, vprod1x6);
+      int16x8_t vprod0x7 = vmull_s8(vget_low_s8(vb7), vget_low_s8(va0));
+      int16x8_t vprod1x7 = vmull_s8(vget_low_s8(vb7), vget_low_s8(va1));
+      vprod0x7 = vmlal_s8(vprod0x7, vget_high_s8(vb7), vget_high_s8(va0));
+      vprod1x7 = vmlal_s8(vprod1x7, vget_high_s8(vb7), vget_high_s8(va1));
+      vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+      vacc1x7 = vpadalq_s16(vacc1x7, vprod1x7);
+      int16x8_t vprod0x8 = vmull_s8(vget_low_s8(vb8), vget_low_s8(va0));
+      int16x8_t vprod1x8 = vmull_s8(vget_low_s8(vb8), vget_low_s8(va1));
+      vprod0x8 = vmlal_s8(vprod0x8, vget_high_s8(vb8), vget_high_s8(va0));
+      vprod1x8 = vmlal_s8(vprod1x8, vget_high_s8(vb8), vget_high_s8(va1));
+      vacc0x8 = vpadalq_s16(vacc0x8, vprod0x8);
+      vacc1x8 = vpadalq_s16(vacc1x8, vprod1x8);
+      int16x8_t vprod0x9 = vmull_s8(vget_low_s8(vb9), vget_low_s8(va0));
+      int16x8_t vprod1x9 = vmull_s8(vget_low_s8(vb9), vget_low_s8(va1));
+      vprod0x9 = vmlal_s8(vprod0x9, vget_high_s8(vb9), vget_high_s8(va0));
+      vprod1x9 = vmlal_s8(vprod1x9, vget_high_s8(vb9), vget_high_s8(va1));
+      vacc0x9 = vpadalq_s16(vacc0x9, vprod0x9);
+      vacc1x9 = vpadalq_s16(vacc1x9, vprod1x9);
+      int16x8_t vprod0x10 = vmull_s8(vget_low_s8(vb10), vget_low_s8(va0));
+      int16x8_t vprod1x10 = vmull_s8(vget_low_s8(vb10), vget_low_s8(va1));
+      vprod0x10 = vmlal_s8(vprod0x10, vget_high_s8(vb10), vget_high_s8(va0));
+      vprod1x10 = vmlal_s8(vprod1x10, vget_high_s8(vb10), vget_high_s8(va1));
+      vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
+      vacc1x10 = vpadalq_s16(vacc1x10, vprod1x10);
+      int16x8_t vprod0x11 = vmull_s8(vget_low_s8(vb11), vget_low_s8(va0));
+      int16x8_t vprod1x11 = vmull_s8(vget_low_s8(vb11), vget_low_s8(va1));
+      vprod0x11 = vmlal_s8(vprod0x11, vget_high_s8(vb11), vget_high_s8(va0));
+      vprod1x11 = vmlal_s8(vprod1x11, vget_high_s8(vb11), vget_high_s8(va1));
+      vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11);
+      vacc1x11 = vpadalq_s16(vacc1x11, vprod1x11);
+      int16x8_t vprod0x12 = vmull_s8(vget_low_s8(vb12), vget_low_s8(va0));
+      int16x8_t vprod1x12 = vmull_s8(vget_low_s8(vb12), vget_low_s8(va1));
+      vprod0x12 = vmlal_s8(vprod0x12, vget_high_s8(vb12), vget_high_s8(va0));
+      vprod1x12 = vmlal_s8(vprod1x12, vget_high_s8(vb12), vget_high_s8(va1));
+      vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);
+      vacc1x12 = vpadalq_s16(vacc1x12, vprod1x12);
+      int16x8_t vprod0x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va0));
+      int16x8_t vprod1x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va1));
+      vprod0x13 = vmlal_s8(vprod0x13, vget_high_s8(vb13), vget_high_s8(va0));
+      vprod1x13 = vmlal_s8(vprod1x13, vget_high_s8(vb13), vget_high_s8(va1));
+      vacc0x13 = vpadalq_s16(vacc0x13, vprod0x13);
+      vacc1x13 = vpadalq_s16(vacc1x13, vprod1x13);
+      int16x8_t vprod0x14 = vmull_s8(vget_low_s8(vb14), vget_low_s8(va0));
+      int16x8_t vprod1x14 = vmull_s8(vget_low_s8(vb14), vget_low_s8(va1));
+      vprod0x14 = vmlal_s8(vprod0x14, vget_high_s8(vb14), vget_high_s8(va0));
+      vprod1x14 = vmlal_s8(vprod1x14, vget_high_s8(vb14), vget_high_s8(va1));
+      vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
+      vacc1x14 = vpadalq_s16(vacc1x14, vprod1x14);
+      int16x8_t vprod0x15 = vmull_s8(vget_low_s8(vb15), vget_low_s8(va0));
+      int16x8_t vprod1x15 = vmull_s8(vget_low_s8(vb15), vget_low_s8(va1));
+      vprod0x15 = vmlal_s8(vprod0x15, vget_high_s8(vb15), vget_high_s8(va0));
+      vprod1x15 = vmlal_s8(vprod1x15, vget_high_s8(vb15), vget_high_s8(va1));
+      vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
+      vacc1x15 = vpadalq_s16(vacc1x15, vprod1x15);
+
+      k += 16 * sizeof(int8_t);
+    }
+
+#if XNN_ARCH_ARM64
+    const int32x4_t vsum0x01 = vpaddq_s32(vacc0x0, vacc0x1);
+    const int32x4_t vsum0x23 = vpaddq_s32(vacc0x2, vacc0x3);
+    const int32x4_t vsum0x45 = vpaddq_s32(vacc0x4, vacc0x5);
+    const int32x4_t vsum0x67 = vpaddq_s32(vacc0x6, vacc0x7);
+    const int32x4_t vsum0x89 = vpaddq_s32(vacc0x8, vacc0x9);
+    const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11);
+    const int32x4_t vsum0xCD = vpaddq_s32(vacc0x12, vacc0x13);
+    const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
+    const int32x4_t vsum1x01 = vpaddq_s32(vacc1x0, vacc1x1);
+    const int32x4_t vsum1x23 = vpaddq_s32(vacc1x2, vacc1x3);
+    const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);
+    const int32x4_t vsum1x67 = vpaddq_s32(vacc1x6, vacc1x7);
+    const int32x4_t vsum1x89 = vpaddq_s32(vacc1x8, vacc1x9);
+    const int32x4_t vsum1xAB = vpaddq_s32(vacc1x10, vacc1x11);
+    const int32x4_t vsum1xCD = vpaddq_s32(vacc1x12, vacc1x13);
+    const int32x4_t vsum1xEF = vpaddq_s32(vacc1x14, vacc1x15);
+    int32x4_t vacc0x0123 = vpaddq_s32(vsum0x01, vsum0x23);
+    int32x4_t vacc0x4567 = vpaddq_s32(vsum0x45, vsum0x67);
+    int32x4_t vacc0x89AB = vpaddq_s32(vsum0x89, vsum0xAB);
+    int32x4_t vacc0xCDEF = vpaddq_s32(vsum0xCD, vsum0xEF);
+    int32x4_t vacc1x0123 = vpaddq_s32(vsum1x01, vsum1x23);
+    int32x4_t vacc1x4567 = vpaddq_s32(vsum1x45, vsum1x67);
+    int32x4_t vacc1x89AB = vpaddq_s32(vsum1x89, vsum1xAB);
+    int32x4_t vacc1xCDEF = vpaddq_s32(vsum1xCD, vsum1xEF);
+#else
+    const int32x2_t vpsum0x0 = vadd_s32(vget_low_s32(vacc0x0), vget_high_s32(vacc0x0));
+    const int32x2_t vpsum0x1 = vadd_s32(vget_low_s32(vacc0x1), vget_high_s32(vacc0x1));
+    const int32x2_t vpsum0x2 = vadd_s32(vget_low_s32(vacc0x2), vget_high_s32(vacc0x2));
+    const int32x2_t vpsum0x3 = vadd_s32(vget_low_s32(vacc0x3), vget_high_s32(vacc0x3));
+    const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1);
+    const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3);
+    int32x4_t vacc0x0123 = vcombine_s32(vsum0x01, vsum0x23 );
+    const int32x2_t vpsum0x4 = vadd_s32(vget_low_s32(vacc0x4), vget_high_s32(vacc0x4));
+    const int32x2_t vpsum0x5 = vadd_s32(vget_low_s32(vacc0x5), vget_high_s32(vacc0x5));
+    const int32x2_t vpsum0x6 = vadd_s32(vget_low_s32(vacc0x6), vget_high_s32(vacc0x6));
+    const int32x2_t vpsum0x7 = vadd_s32(vget_low_s32(vacc0x7), vget_high_s32(vacc0x7));
+    const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5);
+    const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7);
+    int32x4_t vacc0x4567 = vcombine_s32(vsum0x45, vsum0x67 );
+    const int32x2_t vpsum0x8 = vadd_s32(vget_low_s32(vacc0x8), vget_high_s32(vacc0x8));
+    const int32x2_t vpsum0x9 = vadd_s32(vget_low_s32(vacc0x9), vget_high_s32(vacc0x9));
+    const int32x2_t vpsum0xA = vadd_s32(vget_low_s32(vacc0x10), vget_high_s32(vacc0x10));
+    const int32x2_t vpsum0xB = vadd_s32(vget_low_s32(vacc0x11), vget_high_s32(vacc0x11));
+    const int32x2_t vsum0x89 = vpadd_s32(vpsum0x8, vpsum0x9);
+    const int32x2_t vsum0xAB = vpadd_s32(vpsum0xA, vpsum0xB);
+    int32x4_t vacc0x89AB = vcombine_s32(vsum0x89, vsum0xAB );
+    const int32x2_t vpsum0xC = vadd_s32(vget_low_s32(vacc0x12), vget_high_s32(vacc0x12));
+    const int32x2_t vpsum0xD = vadd_s32(vget_low_s32(vacc0x13), vget_high_s32(vacc0x13));
+    const int32x2_t vpsum0xE = vadd_s32(vget_low_s32(vacc0x14), vget_high_s32(vacc0x14));
+    const int32x2_t vpsum0xF = vadd_s32(vget_low_s32(vacc0x15), vget_high_s32(vacc0x15));
+    const int32x2_t vsum0xCD = vpadd_s32(vpsum0xC, vpsum0xD);
+    const int32x2_t vsum0xEF = vpadd_s32(vpsum0xE, vpsum0xF);
+    int32x4_t vacc0xCDEF = vcombine_s32(vsum0xCD, vsum0xEF );
+    const int32x2_t vpsum1x0 = vadd_s32(vget_low_s32(vacc1x0), vget_high_s32(vacc1x0));
+    const int32x2_t vpsum1x1 = vadd_s32(vget_low_s32(vacc1x1), vget_high_s32(vacc1x1));
+    const int32x2_t vpsum1x2 = vadd_s32(vget_low_s32(vacc1x2), vget_high_s32(vacc1x2));
+    const int32x2_t vpsum1x3 = vadd_s32(vget_low_s32(vacc1x3), vget_high_s32(vacc1x3));
+    const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1);
+    const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3);
+    int32x4_t vacc1x0123 = vcombine_s32(vsum1x01, vsum1x23 );
+    const int32x2_t vpsum1x4 = vadd_s32(vget_low_s32(vacc1x4), vget_high_s32(vacc1x4));
+    const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));
+    const int32x2_t vpsum1x6 = vadd_s32(vget_low_s32(vacc1x6), vget_high_s32(vacc1x6));
+    const int32x2_t vpsum1x7 = vadd_s32(vget_low_s32(vacc1x7), vget_high_s32(vacc1x7));
+    const int32x2_t vsum1x45 = vpadd_s32(vpsum1x4, vpsum1x5);
+    const int32x2_t vsum1x67 = vpadd_s32(vpsum1x6, vpsum1x7);
+    int32x4_t vacc1x4567 = vcombine_s32(vsum1x45, vsum1x67 );
+    const int32x2_t vpsum1x8 = vadd_s32(vget_low_s32(vacc1x8), vget_high_s32(vacc1x8));
+    const int32x2_t vpsum1x9 = vadd_s32(vget_low_s32(vacc1x9), vget_high_s32(vacc1x9));
+    const int32x2_t vpsum1xA = vadd_s32(vget_low_s32(vacc1x10), vget_high_s32(vacc1x10));
+    const int32x2_t vpsum1xB = vadd_s32(vget_low_s32(vacc1x11), vget_high_s32(vacc1x11));
+    const int32x2_t vsum1x89 = vpadd_s32(vpsum1x8, vpsum1x9);
+    const int32x2_t vsum1xAB = vpadd_s32(vpsum1xA, vpsum1xB);
+    int32x4_t vacc1x89AB = vcombine_s32(vsum1x89, vsum1xAB );
+    const int32x2_t vpsum1xC = vadd_s32(vget_low_s32(vacc1x12), vget_high_s32(vacc1x12));
+    const int32x2_t vpsum1xD = vadd_s32(vget_low_s32(vacc1x13), vget_high_s32(vacc1x13));
+    const int32x2_t vpsum1xE = vadd_s32(vget_low_s32(vacc1x14), vget_high_s32(vacc1x14));
+    const int32x2_t vpsum1xF = vadd_s32(vget_low_s32(vacc1x15), vget_high_s32(vacc1x15));
+    const int32x2_t vsum1xCD = vpadd_s32(vpsum1xC, vpsum1xD);
+    const int32x2_t vsum1xEF = vpadd_s32(vpsum1xE, vpsum1xF);
+    int32x4_t vacc1xCDEF = vcombine_s32(vsum1xCD, vsum1xEF );
+#endif
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+    vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc1x89AB = vqrdmulhq_s32(vacc1x89AB, vmultiplier);
+    vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+    vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc1x89AB = vsraq_n_s32(vacc1x89AB, vbicq_s32(vacc1x89AB, vzero_shift_mask), 31);
+    vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc1x89AB = vrshlq_s32(vacc1x89AB, vright_shift);
+    vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
+    int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+    int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+    int8x16_t vout1x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc1x01234567), vqmovn_s16(vacc1x89ABCDEF));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+    vout1x0123456789ABCDEF = vmaxq_s8(vout1x0123456789ABCDEF, voutput_min);
+
+    vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+    vout1x0123456789ABCDEF = vminq_s8(vout1x0123456789ABCDEF, voutput_max);
+
+    if (nc >= 16) {
+      vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+      vst1q_s8(c1 + 0, vout1x0123456789ABCDEF);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
+      nc -= 16;
+    } else {
+      int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8(vout1x0123456789ABCDEF));
+      if (nc & 8) {
+        vst1_s8(c0, vget_low_s8(vout0x01234567_1x01234567)); c0 += 8;
+        vst1_s8(c1, vget_high_s8(vout0x01234567_1x01234567)); c1 += 8;
+        vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0123456789ABCDEF));
+      }
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-gemm/gen/2x16c2-minmax-neon-mlal-padal-dup.c b/src/qs8-gemm/gen/2x16c2-minmax-neon-mlal-padal-dup.c
new file mode 100644
index 0000000..cbb3d94
--- /dev/null
+++ b/src/qs8-gemm/gen/2x16c2-minmax-neon-mlal-padal-dup.c
@@ -0,0 +1,456 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/c2-neon-mull-padal-dup.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+
+  do {
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc1x0123 = vacc0x0123;
+    int32x4_t vacc1x4567 = vacc0x4567;
+    int32x4_t vacc1x89AB = vacc0x89AB;
+    int32x4_t vacc1xCDEF = vacc0xCDEF;
+
+    size_t k = kc;
+
+    while (k >= 16 * sizeof(int8_t)) {
+      const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;
+      const int8x8_t va0x1 = vld1_s8(a0); a0 += 8;
+      const int8x8_t va1x0 = vld1_s8(a1); a1 += 8;
+      const int8x8_t va1x1 = vld1_s8(a1); a1 += 8;
+
+      const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABc0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vbCDEFc0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABc1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vbCDEFc1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABc2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vbCDEFc2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABc3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vbCDEFc3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+      int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0)));
+      const int8x8_t vb0123c0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x0123c0 = vmlal_s8(vprod0x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+      vprod1x0123c0 = vmlal_s8(vprod1x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+      int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+      int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0)));
+      const int8x8_t vb4567c0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x4567c0 = vmlal_s8(vprod0x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+      vprod1x4567c0 = vmlal_s8(vprod1x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+      int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+      int16x8_t vprod1x89ABc0 = vmull_s8(vb89ABc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0)));
+      const int8x8_t vb89ABc0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x89ABc0 = vmlal_s8(vprod0x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+      vprod1x89ABc0 = vmlal_s8(vprod1x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
+      vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+      vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc0);
+      int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+      int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0)));
+      const int8x8_t vbCDEFc0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0xCDEFc0 = vmlal_s8(vprod0xCDEFc0, vbCDEFc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+      vprod1xCDEFc0 = vmlal_s8(vprod1xCDEFc0, vbCDEFc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
+      vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+      vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0);
+      int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+      int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 1)));
+      const int8x8_t vb0123c1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x0123c1 = vmlal_s8(vprod0x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+      vprod1x0123c1 = vmlal_s8(vprod1x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+      int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+      int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 1)));
+      const int8x8_t vb4567c1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x4567c1 = vmlal_s8(vprod0x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+      vprod1x4567c1 = vmlal_s8(vprod1x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+      int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+      int16x8_t vprod1x89ABc1 = vmull_s8(vb89ABc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 1)));
+      const int8x8_t vb89ABc1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x89ABc1 = vmlal_s8(vprod0x89ABc1, vb89ABc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+      vprod1x89ABc1 = vmlal_s8(vprod1x89ABc1, vb89ABc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
+      vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+      vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc1);
+      int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+      int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 1)));
+      const int8x8_t vbCDEFc1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0xCDEFc1 = vmlal_s8(vprod0xCDEFc1, vbCDEFc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+      vprod1xCDEFc1 = vmlal_s8(vprod1xCDEFc1, vbCDEFc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
+      vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+      vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1);
+      int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+      int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 2)));
+      const int8x8_t vb0123c2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x0123c2 = vmlal_s8(vprod0x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+      vprod1x0123c2 = vmlal_s8(vprod1x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+      int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+      int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 2)));
+      const int8x8_t vb4567c2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x4567c2 = vmlal_s8(vprod0x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+      vprod1x4567c2 = vmlal_s8(vprod1x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+      int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+      int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 2)));
+      const int8x8_t vb89ABc2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x89ABc2 = vmlal_s8(vprod0x89ABc2, vb89ABc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+      vprod1x89ABc2 = vmlal_s8(vprod1x89ABc2, vb89ABc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
+      vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+      vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
+      int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+      int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 2)));
+      const int8x8_t vbCDEFc2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0xCDEFc2 = vmlal_s8(vprod0xCDEFc2, vbCDEFc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+      vprod1xCDEFc2 = vmlal_s8(vprod1xCDEFc2, vbCDEFc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
+      vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+      vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
+      int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+      int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 3)));
+      const int8x8_t vb0123c3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x0123c3 = vmlal_s8(vprod0x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+      vprod1x0123c3 = vmlal_s8(vprod1x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+      int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+      int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 3)));
+      const int8x8_t vb4567c3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x4567c3 = vmlal_s8(vprod0x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+      vprod1x4567c3 = vmlal_s8(vprod1x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+      int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+      int16x8_t vprod1x89ABc3 = vmull_s8(vb89ABc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 3)));
+      const int8x8_t vb89ABc3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x89ABc3 = vmlal_s8(vprod0x89ABc3, vb89ABc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+      vprod1x89ABc3 = vmlal_s8(vprod1x89ABc3, vb89ABc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
+      vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
+      vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc3);
+      int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+      int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 3)));
+      const int8x8_t vbCDEFc3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0xCDEFc3 = vmlal_s8(vprod0xCDEFc3, vbCDEFc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+      vprod1xCDEFc3 = vmlal_s8(vprod1xCDEFc3, vbCDEFc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
+      vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
+      vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3);
+
+      k -= 16 * sizeof(int8_t);
+    }
+
+    if (k >= 8 * sizeof(int8_t)) {
+      const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+      const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+
+      const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vbCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vbCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vbCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vbCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+      const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+      const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+      const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+      const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+      const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+      const int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      const int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+      const int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+      const int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+      vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+      vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+      vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+      vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
+      const int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      const int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+      const int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+      const int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+      vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+      vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+      vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+      vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
+      const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+      const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+      const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+      const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+      const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+      const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+      const int16x8_t vprod1x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      const int16x8_t vprod1x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+      const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+      const int16x8_t vprod1x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+      vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc0);
+      vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc1);
+      vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
+      vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc3);
+      const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+      const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+      const int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+      vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0);
+      vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1);
+      vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
+      vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3);
+
+      k -= 8 * sizeof(int8_t);
+    }
+
+    if XNN_UNLIKELY(k != 0) {
+      const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+      const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
+
+      const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vbCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+      const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+      const int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+      const int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+      const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+      const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+      const int16x8_t vprod1x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc0);
+      const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0);
+
+      if (k > 2 * sizeof(int8_t)) {
+        const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+        const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+        const int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+        const int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+        const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+        const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+        const int16x8_t vprod1x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc1);
+        const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1);
+
+        if (k > 4 * sizeof(int8_t)) {
+          const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb89ABc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vbCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+          vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+          const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+          vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+          const int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+          vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+          const int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+          vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+          const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+          vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+          const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+          vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+          const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+          vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
+          const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+          vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
+        }
+      }
+    }
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+    vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc1x89AB = vqrdmulhq_s32(vacc1x89AB, vmultiplier);
+    vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+    vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc1x89AB = vsraq_n_s32(vacc1x89AB, vbicq_s32(vacc1x89AB, vzero_shift_mask), 31);
+    vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc1x89AB = vrshlq_s32(vacc1x89AB, vright_shift);
+    vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+    int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+    int8x16_t vout1x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc1x01234567), vqmovn_s16(vacc1x89ABCDEF));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+    vout1x0123456789ABCDEF = vmaxq_s8(vout1x0123456789ABCDEF, voutput_min);
+
+    vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+    vout1x0123456789ABCDEF = vminq_s8(vout1x0123456789ABCDEF, voutput_max);
+
+    if (nc >= 16) {
+      vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+      vst1q_s8(c1 + 0, vout1x0123456789ABCDEF);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
+      nc -= 16;
+    } else {
+      int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8(vout1x0123456789ABCDEF));
+      if (nc & 8) {
+        vst1_s8(c0, vget_low_s8(vout0x01234567_1x01234567)); c0 += 8;
+        vst1_s8(c1, vget_high_s8(vout0x01234567_1x01234567)); c1 += 8;
+        vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0123456789ABCDEF));
+      }
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-gemm/gen/2x16c2-minmax-neon-mull-padal-dup.c b/src/qs8-gemm/gen/2x16c2-minmax-neon-mull-padal-dup.c
new file mode 100644
index 0000000..3cc7e4e
--- /dev/null
+++ b/src/qs8-gemm/gen/2x16c2-minmax-neon-mull-padal-dup.c
@@ -0,0 +1,318 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/c2-neon-mull-padal-dup.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+
+  do {
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc1x0123 = vacc0x0123;
+    int32x4_t vacc1x4567 = vacc0x4567;
+    int32x4_t vacc1x89AB = vacc0x89AB;
+    int32x4_t vacc1xCDEF = vacc0xCDEF;
+
+    size_t k = kc;
+
+
+    while (k >= 8 * sizeof(int8_t)) {
+      const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+      const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+
+      const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vbCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vbCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vbCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vbCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+      const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+      const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+      const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+      const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+      const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+      const int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      const int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+      const int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+      const int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+      vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+      vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+      vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+      vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
+      const int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      const int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+      const int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+      const int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+      vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+      vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+      vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+      vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
+      const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+      const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+      const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+      const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+      const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+      const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+      const int16x8_t vprod1x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      const int16x8_t vprod1x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+      const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+      const int16x8_t vprod1x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+      vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc0);
+      vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc1);
+      vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
+      vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc3);
+      const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+      const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+      const int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+      vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0);
+      vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1);
+      vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
+      vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3);
+
+      k -= 8 * sizeof(int8_t);
+    }
+
+    if XNN_UNLIKELY(k != 0) {
+      const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+      const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
+
+      const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vbCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+      const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+      const int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+      const int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+      const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+      const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+      const int16x8_t vprod1x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc0);
+      const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0);
+
+      if (k > 2 * sizeof(int8_t)) {
+        const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+        const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+        const int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+        const int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+        const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+        const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+        const int16x8_t vprod1x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc1);
+        const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1);
+
+        if (k > 4 * sizeof(int8_t)) {
+          const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb89ABc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vbCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+          vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+          const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+          vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+          const int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+          vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+          const int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+          vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+          const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+          vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+          const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+          vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+          const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+          vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
+          const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+          vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
+        }
+      }
+    }
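+    // Requantize: saturating rounding doubling multiply-high by the fixed-point
+    // multiplier, followed by a rounding right shift; the vsraq/vbic step adjusts
+    // negative accumulators before the shift.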
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+    vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc1x89AB = vqrdmulhq_s32(vacc1x89AB, vmultiplier);
+    vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+    vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc1x89AB = vsraq_n_s32(vacc1x89AB, vbicq_s32(vacc1x89AB, vzero_shift_mask), 31);
+    vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc1x89AB = vrshlq_s32(vacc1x89AB, vright_shift);
+    vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_shift);
+
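+    // Narrow the accumulators to int16 with saturation, add the output zero point,
+    // then saturate down to int8.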
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+    int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+    int8x16_t vout1x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc1x01234567), vqmovn_s16(vacc1x89ABCDEF));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+    vout1x0123456789ABCDEF = vmaxq_s8(vout1x0123456789ABCDEF, voutput_min);
+
+    vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+    vout1x0123456789ABCDEF = vminq_s8(vout1x0123456789ABCDEF, voutput_max);
+
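+    // Store a full 16-column tile when possible; otherwise write the remaining
+    // columns out 8, 4, 2 and 1 at a time.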
+    if (nc >= 16) {
+      vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+      vst1q_s8(c1 + 0, vout1x0123456789ABCDEF);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
+      nc -= 16;
+    } else {
+      int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8(vout1x0123456789ABCDEF));
+      if (nc & 8) {
+        vst1_s8(c0, vget_low_s8(vout0x01234567_1x01234567)); c0 += 8;
+        vst1_s8(c1, vget_high_s8(vout0x01234567_1x01234567)); c1 += 8;
+        vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0123456789ABCDEF));
+      }
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-gemm/gen/2x16c8-minmax-avx512skx.c b/src/qs8-gemm/gen/2x16c8-minmax-avx512skx.c
index bfc594b..1e18779 100644
--- a/src/qs8-gemm/gen/2x16c8-minmax-avx512skx.c
+++ b/src/qs8-gemm/gen/2x16c8-minmax-avx512skx.c
@@ -13,6 +13,7 @@
 
 #include <xnnpack/gemm.h>
 #include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx(
@@ -36,6 +37,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -145,12 +147,12 @@
       _mm_storeu_si128((__m128i*) c0, _mm256_castsi256_si128(vout01x0123456789ABCDEF));
       _mm_storeu_si128((__m128i*) c1, _mm256_extracti128_si256(vout01x0123456789ABCDEF, 1));
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
       nc -= 16;
     } else {
       // Prepare mask for valid 8-bit elements (depends on nc).
diff --git a/src/qs8-gemm/gen/2x16c8-minmax-neon-mlal-padal.c b/src/qs8-gemm/gen/2x16c8-minmax-neon-mlal-padal.c
new file mode 100644
index 0000000..ed98e66
--- /dev/null
+++ b/src/qs8-gemm/gen/2x16c8-minmax-neon-mlal-padal.c
@@ -0,0 +1,489 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/c8-neon-mull-padal.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+
+  do {
+    int32x4_t vacc0x0 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x1 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x2 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x3 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x4 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x5 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x6 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x7 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x8 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x9 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x10 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x11 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x12 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x13 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x14 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x15 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc1x0 = vacc0x0;
+    int32x4_t vacc1x1 = vacc0x1;
+    int32x4_t vacc1x2 = vacc0x2;
+    int32x4_t vacc1x3 = vacc0x3;
+    int32x4_t vacc1x4 = vacc0x4;
+    int32x4_t vacc1x5 = vacc0x5;
+    int32x4_t vacc1x6 = vacc0x6;
+    int32x4_t vacc1x7 = vacc0x7;
+    int32x4_t vacc1x8 = vacc0x8;
+    int32x4_t vacc1x9 = vacc0x9;
+    int32x4_t vacc1x10 = vacc0x10;
+    int32x4_t vacc1x11 = vacc0x11;
+    int32x4_t vacc1x12 = vacc0x12;
+    int32x4_t vacc1x13 = vacc0x13;
+    int32x4_t vacc1x14 = vacc0x14;
+    int32x4_t vacc1x15 = vacc0x15;
+
+    size_t k = kc;
+    // 2x partial unrolled loop to load 16 bytes at a time using MLA.
+    while (k >= 16 * sizeof(int8_t)) {
+      const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;
+      const int8x8_t va0x1 = vld1_s8(a0); a0 += 8;
+      const int8x8_t va1x0 = vld1_s8(a1); a1 += 8;
+      const int8x8_t va1x1 = vld1_s8(a1); a1 += 8;
+
+      const int8x8_t vb0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb4x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb5x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb6x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb7x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb8x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb9x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb10x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb11x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb12x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb13x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb14x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb15x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+
+      const int8x8_t vb0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x0 = vmull_s8(vb0x0, va0x0);
+      int16x8_t vprod1x0 = vmull_s8(vb0x0, va1x0);
+      vprod0x0 = vmlal_s8(vprod0x0, vb0x1, va0x1);
+      vprod1x0 = vmlal_s8(vprod1x0, vb0x1, va1x1);
+      vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+      vacc1x0 = vpadalq_s16(vacc1x0, vprod1x0);
+      const int8x8_t vb1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x1 = vmull_s8(vb1x0, va0x0);
+      int16x8_t vprod1x1 = vmull_s8(vb1x0, va1x0);
+      vprod0x1 = vmlal_s8(vprod0x1, vb1x1, va0x1);
+      vprod1x1 = vmlal_s8(vprod1x1, vb1x1, va1x1);
+      vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+      vacc1x1 = vpadalq_s16(vacc1x1, vprod1x1);
+      const int8x8_t vb2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x2 = vmull_s8(vb2x0, va0x0);
+      int16x8_t vprod1x2 = vmull_s8(vb2x0, va1x0);
+      vprod0x2 = vmlal_s8(vprod0x2, vb2x1, va0x1);
+      vprod1x2 = vmlal_s8(vprod1x2, vb2x1, va1x1);
+      vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+      vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);
+      const int8x8_t vb3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x3 = vmull_s8(vb3x0, va0x0);
+      int16x8_t vprod1x3 = vmull_s8(vb3x0, va1x0);
+      vprod0x3 = vmlal_s8(vprod0x3, vb3x1, va0x1);
+      vprod1x3 = vmlal_s8(vprod1x3, vb3x1, va1x1);
+      vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+      vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);
+      const int8x8_t vb4x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x4 = vmull_s8(vb4x0, va0x0);
+      int16x8_t vprod1x4 = vmull_s8(vb4x0, va1x0);
+      vprod0x4 = vmlal_s8(vprod0x4, vb4x1, va0x1);
+      vprod1x4 = vmlal_s8(vprod1x4, vb4x1, va1x1);
+      vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+      vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);
+      const int8x8_t vb5x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x5 = vmull_s8(vb5x0, va0x0);
+      int16x8_t vprod1x5 = vmull_s8(vb5x0, va1x0);
+      vprod0x5 = vmlal_s8(vprod0x5, vb5x1, va0x1);
+      vprod1x5 = vmlal_s8(vprod1x5, vb5x1, va1x1);
+      vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+      vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);
+      const int8x8_t vb6x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x6 = vmull_s8(vb6x0, va0x0);
+      int16x8_t vprod1x6 = vmull_s8(vb6x0, va1x0);
+      vprod0x6 = vmlal_s8(vprod0x6, vb6x1, va0x1);
+      vprod1x6 = vmlal_s8(vprod1x6, vb6x1, va1x1);
+      vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+      vacc1x6 = vpadalq_s16(vacc1x6, vprod1x6);
+      const int8x8_t vb7x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x7 = vmull_s8(vb7x0, va0x0);
+      int16x8_t vprod1x7 = vmull_s8(vb7x0, va1x0);
+      vprod0x7 = vmlal_s8(vprod0x7, vb7x1, va0x1);
+      vprod1x7 = vmlal_s8(vprod1x7, vb7x1, va1x1);
+      vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+      vacc1x7 = vpadalq_s16(vacc1x7, vprod1x7);
+      const int8x8_t vb8x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x8 = vmull_s8(vb8x0, va0x0);
+      int16x8_t vprod1x8 = vmull_s8(vb8x0, va1x0);
+      vprod0x8 = vmlal_s8(vprod0x8, vb8x1, va0x1);
+      vprod1x8 = vmlal_s8(vprod1x8, vb8x1, va1x1);
+      vacc0x8 = vpadalq_s16(vacc0x8, vprod0x8);
+      vacc1x8 = vpadalq_s16(vacc1x8, vprod1x8);
+      const int8x8_t vb9x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x9 = vmull_s8(vb9x0, va0x0);
+      int16x8_t vprod1x9 = vmull_s8(vb9x0, va1x0);
+      vprod0x9 = vmlal_s8(vprod0x9, vb9x1, va0x1);
+      vprod1x9 = vmlal_s8(vprod1x9, vb9x1, va1x1);
+      vacc0x9 = vpadalq_s16(vacc0x9, vprod0x9);
+      vacc1x9 = vpadalq_s16(vacc1x9, vprod1x9);
+      const int8x8_t vb10x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x10 = vmull_s8(vb10x0, va0x0);
+      int16x8_t vprod1x10 = vmull_s8(vb10x0, va1x0);
+      vprod0x10 = vmlal_s8(vprod0x10, vb10x1, va0x1);
+      vprod1x10 = vmlal_s8(vprod1x10, vb10x1, va1x1);
+      vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
+      vacc1x10 = vpadalq_s16(vacc1x10, vprod1x10);
+      const int8x8_t vb11x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x11 = vmull_s8(vb11x0, va0x0);
+      int16x8_t vprod1x11 = vmull_s8(vb11x0, va1x0);
+      vprod0x11 = vmlal_s8(vprod0x11, vb11x1, va0x1);
+      vprod1x11 = vmlal_s8(vprod1x11, vb11x1, va1x1);
+      vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11);
+      vacc1x11 = vpadalq_s16(vacc1x11, vprod1x11);
+      const int8x8_t vb12x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x12 = vmull_s8(vb12x0, va0x0);
+      int16x8_t vprod1x12 = vmull_s8(vb12x0, va1x0);
+      vprod0x12 = vmlal_s8(vprod0x12, vb12x1, va0x1);
+      vprod1x12 = vmlal_s8(vprod1x12, vb12x1, va1x1);
+      vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);
+      vacc1x12 = vpadalq_s16(vacc1x12, vprod1x12);
+      const int8x8_t vb13x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x13 = vmull_s8(vb13x0, va0x0);
+      int16x8_t vprod1x13 = vmull_s8(vb13x0, va1x0);
+      vprod0x13 = vmlal_s8(vprod0x13, vb13x1, va0x1);
+      vprod1x13 = vmlal_s8(vprod1x13, vb13x1, va1x1);
+      vacc0x13 = vpadalq_s16(vacc0x13, vprod0x13);
+      vacc1x13 = vpadalq_s16(vacc1x13, vprod1x13);
+      const int8x8_t vb14x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x14 = vmull_s8(vb14x0, va0x0);
+      int16x8_t vprod1x14 = vmull_s8(vb14x0, va1x0);
+      vprod0x14 = vmlal_s8(vprod0x14, vb14x1, va0x1);
+      vprod1x14 = vmlal_s8(vprod1x14, vb14x1, va1x1);
+      vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
+      vacc1x14 = vpadalq_s16(vacc1x14, vprod1x14);
+      const int8x8_t vb15x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x15 = vmull_s8(vb15x0, va0x0);
+      int16x8_t vprod1x15 = vmull_s8(vb15x0, va1x0);
+      vprod0x15 = vmlal_s8(vprod0x15, vb15x1, va0x1);
+      vprod1x15 = vmlal_s8(vprod1x15, vb15x1, va1x1);
+      vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
+      vacc1x15 = vpadalq_s16(vacc1x15, vprod1x15);
+
+      k -= 16 * sizeof(int8_t);
+    }
+
+    // Handle 8 bytes at a time using MUL.
+    if (k > 0) {
+      const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+      const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+
+      const int8x8_t vb0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x0 = vmull_s8(vb0, va0);
+      const int16x8_t vprod1x0 = vmull_s8(vb0, va1);
+      vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+      vacc1x0 = vpadalq_s16(vacc1x0, vprod1x0);
+      const int8x8_t vb1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x1 = vmull_s8(vb1, va0);
+      const int16x8_t vprod1x1 = vmull_s8(vb1, va1);
+      vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+      vacc1x1 = vpadalq_s16(vacc1x1, vprod1x1);
+      const int8x8_t vb2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x2 = vmull_s8(vb2, va0);
+      const int16x8_t vprod1x2 = vmull_s8(vb2, va1);
+      vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+      vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);
+      const int8x8_t vb3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x3 = vmull_s8(vb3, va0);
+      const int16x8_t vprod1x3 = vmull_s8(vb3, va1);
+      vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+      vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);
+      const int8x8_t vb4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x4 = vmull_s8(vb4, va0);
+      const int16x8_t vprod1x4 = vmull_s8(vb4, va1);
+      vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+      vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);
+      const int8x8_t vb5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x5 = vmull_s8(vb5, va0);
+      const int16x8_t vprod1x5 = vmull_s8(vb5, va1);
+      vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+      vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);
+      const int8x8_t vb6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x6 = vmull_s8(vb6, va0);
+      const int16x8_t vprod1x6 = vmull_s8(vb6, va1);
+      vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+      vacc1x6 = vpadalq_s16(vacc1x6, vprod1x6);
+      const int8x8_t vb7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x7 = vmull_s8(vb7, va0);
+      const int16x8_t vprod1x7 = vmull_s8(vb7, va1);
+      vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+      vacc1x7 = vpadalq_s16(vacc1x7, vprod1x7);
+      const int8x8_t vb8 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x8 = vmull_s8(vb8, va0);
+      const int16x8_t vprod1x8 = vmull_s8(vb8, va1);
+      vacc0x8 = vpadalq_s16(vacc0x8, vprod0x8);
+      vacc1x8 = vpadalq_s16(vacc1x8, vprod1x8);
+      const int8x8_t vb9 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x9 = vmull_s8(vb9, va0);
+      const int16x8_t vprod1x9 = vmull_s8(vb9, va1);
+      vacc0x9 = vpadalq_s16(vacc0x9, vprod0x9);
+      vacc1x9 = vpadalq_s16(vacc1x9, vprod1x9);
+      const int8x8_t vb10 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x10 = vmull_s8(vb10, va0);
+      const int16x8_t vprod1x10 = vmull_s8(vb10, va1);
+      vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
+      vacc1x10 = vpadalq_s16(vacc1x10, vprod1x10);
+      const int8x8_t vb11 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x11 = vmull_s8(vb11, va0);
+      const int16x8_t vprod1x11 = vmull_s8(vb11, va1);
+      vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11);
+      vacc1x11 = vpadalq_s16(vacc1x11, vprod1x11);
+      const int8x8_t vb12 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x12 = vmull_s8(vb12, va0);
+      const int16x8_t vprod1x12 = vmull_s8(vb12, va1);
+      vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);
+      vacc1x12 = vpadalq_s16(vacc1x12, vprod1x12);
+      const int8x8_t vb13 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x13 = vmull_s8(vb13, va0);
+      const int16x8_t vprod1x13 = vmull_s8(vb13, va1);
+      vacc0x13 = vpadalq_s16(vacc0x13, vprod0x13);
+      vacc1x13 = vpadalq_s16(vacc1x13, vprod1x13);
+      const int8x8_t vb14 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x14 = vmull_s8(vb14, va0);
+      const int16x8_t vprod1x14 = vmull_s8(vb14, va1);
+      vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
+      vacc1x14 = vpadalq_s16(vacc1x14, vprod1x14);
+      const int8x8_t vb15 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x15 = vmull_s8(vb15, va0);
+      const int16x8_t vprod1x15 = vmull_s8(vb15, va1);
+      vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
+      vacc1x15 = vpadalq_s16(vacc1x15, vprod1x15);
+
+      k -= 8 * sizeof(int8_t);
+    }
+
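+    // Each vaccMxN vector holds 4 partial sums for a single output column; pairwise
+    // adds reduce them to one sum per column, packed four columns per int32x4_t.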
+#if XNN_ARCH_ARM64
+    const int32x4_t vsum0x01 = vpaddq_s32(vacc0x0, vacc0x1);
+    const int32x4_t vsum0x23 = vpaddq_s32(vacc0x2, vacc0x3);
+    const int32x4_t vsum0x45 = vpaddq_s32(vacc0x4, vacc0x5);
+    const int32x4_t vsum0x67 = vpaddq_s32(vacc0x6, vacc0x7);
+    const int32x4_t vsum0x89 = vpaddq_s32(vacc0x8, vacc0x9);
+    const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11);
+    const int32x4_t vsum0xCD = vpaddq_s32(vacc0x12, vacc0x13);
+    const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
+    const int32x4_t vsum1x01 = vpaddq_s32(vacc1x0, vacc1x1);
+    const int32x4_t vsum1x23 = vpaddq_s32(vacc1x2, vacc1x3);
+    const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);
+    const int32x4_t vsum1x67 = vpaddq_s32(vacc1x6, vacc1x7);
+    const int32x4_t vsum1x89 = vpaddq_s32(vacc1x8, vacc1x9);
+    const int32x4_t vsum1xAB = vpaddq_s32(vacc1x10, vacc1x11);
+    const int32x4_t vsum1xCD = vpaddq_s32(vacc1x12, vacc1x13);
+    const int32x4_t vsum1xEF = vpaddq_s32(vacc1x14, vacc1x15);
+    int32x4_t vacc0x0123 = vpaddq_s32(vsum0x01, vsum0x23);
+    int32x4_t vacc0x4567 = vpaddq_s32(vsum0x45, vsum0x67);
+    int32x4_t vacc0x89AB = vpaddq_s32(vsum0x89, vsum0xAB);
+    int32x4_t vacc0xCDEF = vpaddq_s32(vsum0xCD, vsum0xEF);
+    int32x4_t vacc1x0123 = vpaddq_s32(vsum1x01, vsum1x23);
+    int32x4_t vacc1x4567 = vpaddq_s32(vsum1x45, vsum1x67);
+    int32x4_t vacc1x89AB = vpaddq_s32(vsum1x89, vsum1xAB);
+    int32x4_t vacc1xCDEF = vpaddq_s32(vsum1xCD, vsum1xEF);
+#else
+    const int32x2_t vpsum0x0 = vadd_s32(vget_low_s32(vacc0x0), vget_high_s32(vacc0x0));
+    const int32x2_t vpsum0x1 = vadd_s32(vget_low_s32(vacc0x1), vget_high_s32(vacc0x1));
+    const int32x2_t vpsum0x2 = vadd_s32(vget_low_s32(vacc0x2), vget_high_s32(vacc0x2));
+    const int32x2_t vpsum0x3 = vadd_s32(vget_low_s32(vacc0x3), vget_high_s32(vacc0x3));
+    const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1);
+    const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3);
+    int32x4_t vacc0x0123 = vcombine_s32(vsum0x01, vsum0x23 );
+    const int32x2_t vpsum0x4 = vadd_s32(vget_low_s32(vacc0x4), vget_high_s32(vacc0x4));
+    const int32x2_t vpsum0x5 = vadd_s32(vget_low_s32(vacc0x5), vget_high_s32(vacc0x5));
+    const int32x2_t vpsum0x6 = vadd_s32(vget_low_s32(vacc0x6), vget_high_s32(vacc0x6));
+    const int32x2_t vpsum0x7 = vadd_s32(vget_low_s32(vacc0x7), vget_high_s32(vacc0x7));
+    const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5);
+    const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7);
+    int32x4_t vacc0x4567 = vcombine_s32(vsum0x45, vsum0x67 );
+    const int32x2_t vpsum0x8 = vadd_s32(vget_low_s32(vacc0x8), vget_high_s32(vacc0x8));
+    const int32x2_t vpsum0x9 = vadd_s32(vget_low_s32(vacc0x9), vget_high_s32(vacc0x9));
+    const int32x2_t vpsum0xA = vadd_s32(vget_low_s32(vacc0x10), vget_high_s32(vacc0x10));
+    const int32x2_t vpsum0xB = vadd_s32(vget_low_s32(vacc0x11), vget_high_s32(vacc0x11));
+    const int32x2_t vsum0x89 = vpadd_s32(vpsum0x8, vpsum0x9);
+    const int32x2_t vsum0xAB = vpadd_s32(vpsum0xA, vpsum0xB);
+    int32x4_t vacc0x89AB = vcombine_s32(vsum0x89, vsum0xAB );
+    const int32x2_t vpsum0xC = vadd_s32(vget_low_s32(vacc0x12), vget_high_s32(vacc0x12));
+    const int32x2_t vpsum0xD = vadd_s32(vget_low_s32(vacc0x13), vget_high_s32(vacc0x13));
+    const int32x2_t vpsum0xE = vadd_s32(vget_low_s32(vacc0x14), vget_high_s32(vacc0x14));
+    const int32x2_t vpsum0xF = vadd_s32(vget_low_s32(vacc0x15), vget_high_s32(vacc0x15));
+    const int32x2_t vsum0xCD = vpadd_s32(vpsum0xC, vpsum0xD);
+    const int32x2_t vsum0xEF = vpadd_s32(vpsum0xE, vpsum0xF);
+    int32x4_t vacc0xCDEF = vcombine_s32(vsum0xCD, vsum0xEF );
+    const int32x2_t vpsum1x0 = vadd_s32(vget_low_s32(vacc1x0), vget_high_s32(vacc1x0));
+    const int32x2_t vpsum1x1 = vadd_s32(vget_low_s32(vacc1x1), vget_high_s32(vacc1x1));
+    const int32x2_t vpsum1x2 = vadd_s32(vget_low_s32(vacc1x2), vget_high_s32(vacc1x2));
+    const int32x2_t vpsum1x3 = vadd_s32(vget_low_s32(vacc1x3), vget_high_s32(vacc1x3));
+    const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1);
+    const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3);
+    int32x4_t vacc1x0123 = vcombine_s32(vsum1x01, vsum1x23 );
+    const int32x2_t vpsum1x4 = vadd_s32(vget_low_s32(vacc1x4), vget_high_s32(vacc1x4));
+    const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));
+    const int32x2_t vpsum1x6 = vadd_s32(vget_low_s32(vacc1x6), vget_high_s32(vacc1x6));
+    const int32x2_t vpsum1x7 = vadd_s32(vget_low_s32(vacc1x7), vget_high_s32(vacc1x7));
+    const int32x2_t vsum1x45 = vpadd_s32(vpsum1x4, vpsum1x5);
+    const int32x2_t vsum1x67 = vpadd_s32(vpsum1x6, vpsum1x7);
+    int32x4_t vacc1x4567 = vcombine_s32(vsum1x45, vsum1x67 );
+    const int32x2_t vpsum1x8 = vadd_s32(vget_low_s32(vacc1x8), vget_high_s32(vacc1x8));
+    const int32x2_t vpsum1x9 = vadd_s32(vget_low_s32(vacc1x9), vget_high_s32(vacc1x9));
+    const int32x2_t vpsum1xA = vadd_s32(vget_low_s32(vacc1x10), vget_high_s32(vacc1x10));
+    const int32x2_t vpsum1xB = vadd_s32(vget_low_s32(vacc1x11), vget_high_s32(vacc1x11));
+    const int32x2_t vsum1x89 = vpadd_s32(vpsum1x8, vpsum1x9);
+    const int32x2_t vsum1xAB = vpadd_s32(vpsum1xA, vpsum1xB);
+    int32x4_t vacc1x89AB = vcombine_s32(vsum1x89, vsum1xAB );
+    const int32x2_t vpsum1xC = vadd_s32(vget_low_s32(vacc1x12), vget_high_s32(vacc1x12));
+    const int32x2_t vpsum1xD = vadd_s32(vget_low_s32(vacc1x13), vget_high_s32(vacc1x13));
+    const int32x2_t vpsum1xE = vadd_s32(vget_low_s32(vacc1x14), vget_high_s32(vacc1x14));
+    const int32x2_t vpsum1xF = vadd_s32(vget_low_s32(vacc1x15), vget_high_s32(vacc1x15));
+    const int32x2_t vsum1xCD = vpadd_s32(vpsum1xC, vpsum1xD);
+    const int32x2_t vsum1xEF = vpadd_s32(vpsum1xE, vpsum1xF);
+    int32x4_t vacc1xCDEF = vcombine_s32(vsum1xCD, vsum1xEF );
+#endif
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+    vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc1x89AB = vqrdmulhq_s32(vacc1x89AB, vmultiplier);
+    vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+    vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc1x89AB = vsraq_n_s32(vacc1x89AB, vbicq_s32(vacc1x89AB, vzero_shift_mask), 31);
+    vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc1x89AB = vrshlq_s32(vacc1x89AB, vright_shift);
+    vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
+    int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+    int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+    int8x16_t vout1x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc1x01234567), vqmovn_s16(vacc1x89ABCDEF));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+    vout1x0123456789ABCDEF = vmaxq_s8(vout1x0123456789ABCDEF, voutput_min);
+
+    vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+    vout1x0123456789ABCDEF = vminq_s8(vout1x0123456789ABCDEF, voutput_max);
+
+    if (nc >= 16) {
+      vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+      vst1q_s8(c1 + 0, vout1x0123456789ABCDEF);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
+      nc -= 16;
+    } else {
+      int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8(vout1x0123456789ABCDEF));
+      if (nc & 8) {
+        vst1_s8(c0, vget_low_s8(vout0x01234567_1x01234567)); c0 += 8;
+        vst1_s8(c1, vget_high_s8(vout0x01234567_1x01234567)); c1 += 8;
+        vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0123456789ABCDEF));
+      }
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-gemm/gen/2x16c8-minmax-neon-mull-padal.c b/src/qs8-gemm/gen/2x16c8-minmax-neon-mull-padal.c
new file mode 100644
index 0000000..6b4b24d
--- /dev/null
+++ b/src/qs8-gemm/gen/2x16c8-minmax-neon-mull-padal.c
@@ -0,0 +1,350 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/c8-neon-mull-padal.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+
+  do {
+    int32x4_t vacc0x0 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x1 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x2 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x3 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x4 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x5 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x6 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x7 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x8 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x9 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x10 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x11 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x12 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x13 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x14 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x15 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc1x0 = vacc0x0;
+    int32x4_t vacc1x1 = vacc0x1;
+    int32x4_t vacc1x2 = vacc0x2;
+    int32x4_t vacc1x3 = vacc0x3;
+    int32x4_t vacc1x4 = vacc0x4;
+    int32x4_t vacc1x5 = vacc0x5;
+    int32x4_t vacc1x6 = vacc0x6;
+    int32x4_t vacc1x7 = vacc0x7;
+    int32x4_t vacc1x8 = vacc0x8;
+    int32x4_t vacc1x9 = vacc0x9;
+    int32x4_t vacc1x10 = vacc0x10;
+    int32x4_t vacc1x11 = vacc0x11;
+    int32x4_t vacc1x12 = vacc0x12;
+    int32x4_t vacc1x13 = vacc0x13;
+    int32x4_t vacc1x14 = vacc0x14;
+    int32x4_t vacc1x15 = vacc0x15;
+
+    size_t k = kc;
+
+    // Handle 8 bytes at a time using MUL.
+    while (k > 0) {
+      const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+      const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+
+      const int8x8_t vb0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x0 = vmull_s8(vb0, va0);
+      const int16x8_t vprod1x0 = vmull_s8(vb0, va1);
+      vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+      vacc1x0 = vpadalq_s16(vacc1x0, vprod1x0);
+      const int8x8_t vb1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x1 = vmull_s8(vb1, va0);
+      const int16x8_t vprod1x1 = vmull_s8(vb1, va1);
+      vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+      vacc1x1 = vpadalq_s16(vacc1x1, vprod1x1);
+      const int8x8_t vb2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x2 = vmull_s8(vb2, va0);
+      const int16x8_t vprod1x2 = vmull_s8(vb2, va1);
+      vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+      vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);
+      const int8x8_t vb3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x3 = vmull_s8(vb3, va0);
+      const int16x8_t vprod1x3 = vmull_s8(vb3, va1);
+      vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+      vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);
+      const int8x8_t vb4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x4 = vmull_s8(vb4, va0);
+      const int16x8_t vprod1x4 = vmull_s8(vb4, va1);
+      vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+      vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);
+      const int8x8_t vb5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x5 = vmull_s8(vb5, va0);
+      const int16x8_t vprod1x5 = vmull_s8(vb5, va1);
+      vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+      vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);
+      const int8x8_t vb6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x6 = vmull_s8(vb6, va0);
+      const int16x8_t vprod1x6 = vmull_s8(vb6, va1);
+      vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+      vacc1x6 = vpadalq_s16(vacc1x6, vprod1x6);
+      const int8x8_t vb7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x7 = vmull_s8(vb7, va0);
+      const int16x8_t vprod1x7 = vmull_s8(vb7, va1);
+      vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+      vacc1x7 = vpadalq_s16(vacc1x7, vprod1x7);
+      const int8x8_t vb8 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x8 = vmull_s8(vb8, va0);
+      const int16x8_t vprod1x8 = vmull_s8(vb8, va1);
+      vacc0x8 = vpadalq_s16(vacc0x8, vprod0x8);
+      vacc1x8 = vpadalq_s16(vacc1x8, vprod1x8);
+      const int8x8_t vb9 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x9 = vmull_s8(vb9, va0);
+      const int16x8_t vprod1x9 = vmull_s8(vb9, va1);
+      vacc0x9 = vpadalq_s16(vacc0x9, vprod0x9);
+      vacc1x9 = vpadalq_s16(vacc1x9, vprod1x9);
+      const int8x8_t vb10 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x10 = vmull_s8(vb10, va0);
+      const int16x8_t vprod1x10 = vmull_s8(vb10, va1);
+      vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
+      vacc1x10 = vpadalq_s16(vacc1x10, vprod1x10);
+      const int8x8_t vb11 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x11 = vmull_s8(vb11, va0);
+      const int16x8_t vprod1x11 = vmull_s8(vb11, va1);
+      vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11);
+      vacc1x11 = vpadalq_s16(vacc1x11, vprod1x11);
+      const int8x8_t vb12 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x12 = vmull_s8(vb12, va0);
+      const int16x8_t vprod1x12 = vmull_s8(vb12, va1);
+      vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);
+      vacc1x12 = vpadalq_s16(vacc1x12, vprod1x12);
+      const int8x8_t vb13 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x13 = vmull_s8(vb13, va0);
+      const int16x8_t vprod1x13 = vmull_s8(vb13, va1);
+      vacc0x13 = vpadalq_s16(vacc0x13, vprod0x13);
+      vacc1x13 = vpadalq_s16(vacc1x13, vprod1x13);
+      const int8x8_t vb14 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x14 = vmull_s8(vb14, va0);
+      const int16x8_t vprod1x14 = vmull_s8(vb14, va1);
+      vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
+      vacc1x14 = vpadalq_s16(vacc1x14, vprod1x14);
+      const int8x8_t vb15 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x15 = vmull_s8(vb15, va0);
+      const int16x8_t vprod1x15 = vmull_s8(vb15, va1);
+      vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
+      vacc1x15 = vpadalq_s16(vacc1x15, vprod1x15);
+
+      k -= 8 * sizeof(int8_t);
+    }
+
+#if XNN_ARCH_ARM64
+    const int32x4_t vsum0x01 = vpaddq_s32(vacc0x0, vacc0x1);
+    const int32x4_t vsum0x23 = vpaddq_s32(vacc0x2, vacc0x3);
+    const int32x4_t vsum0x45 = vpaddq_s32(vacc0x4, vacc0x5);
+    const int32x4_t vsum0x67 = vpaddq_s32(vacc0x6, vacc0x7);
+    const int32x4_t vsum0x89 = vpaddq_s32(vacc0x8, vacc0x9);
+    const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11);
+    const int32x4_t vsum0xCD = vpaddq_s32(vacc0x12, vacc0x13);
+    const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
+    const int32x4_t vsum1x01 = vpaddq_s32(vacc1x0, vacc1x1);
+    const int32x4_t vsum1x23 = vpaddq_s32(vacc1x2, vacc1x3);
+    const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);
+    const int32x4_t vsum1x67 = vpaddq_s32(vacc1x6, vacc1x7);
+    const int32x4_t vsum1x89 = vpaddq_s32(vacc1x8, vacc1x9);
+    const int32x4_t vsum1xAB = vpaddq_s32(vacc1x10, vacc1x11);
+    const int32x4_t vsum1xCD = vpaddq_s32(vacc1x12, vacc1x13);
+    const int32x4_t vsum1xEF = vpaddq_s32(vacc1x14, vacc1x15);
+    int32x4_t vacc0x0123 = vpaddq_s32(vsum0x01, vsum0x23);
+    int32x4_t vacc0x4567 = vpaddq_s32(vsum0x45, vsum0x67);
+    int32x4_t vacc0x89AB = vpaddq_s32(vsum0x89, vsum0xAB);
+    int32x4_t vacc0xCDEF = vpaddq_s32(vsum0xCD, vsum0xEF);
+    int32x4_t vacc1x0123 = vpaddq_s32(vsum1x01, vsum1x23);
+    int32x4_t vacc1x4567 = vpaddq_s32(vsum1x45, vsum1x67);
+    int32x4_t vacc1x89AB = vpaddq_s32(vsum1x89, vsum1xAB);
+    int32x4_t vacc1xCDEF = vpaddq_s32(vsum1xCD, vsum1xEF);
+#else
+    const int32x2_t vpsum0x0 = vadd_s32(vget_low_s32(vacc0x0), vget_high_s32(vacc0x0));
+    const int32x2_t vpsum0x1 = vadd_s32(vget_low_s32(vacc0x1), vget_high_s32(vacc0x1));
+    const int32x2_t vpsum0x2 = vadd_s32(vget_low_s32(vacc0x2), vget_high_s32(vacc0x2));
+    const int32x2_t vpsum0x3 = vadd_s32(vget_low_s32(vacc0x3), vget_high_s32(vacc0x3));
+    const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1);
+    const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3);
+    int32x4_t vacc0x0123 = vcombine_s32(vsum0x01, vsum0x23 );
+    const int32x2_t vpsum0x4 = vadd_s32(vget_low_s32(vacc0x4), vget_high_s32(vacc0x4));
+    const int32x2_t vpsum0x5 = vadd_s32(vget_low_s32(vacc0x5), vget_high_s32(vacc0x5));
+    const int32x2_t vpsum0x6 = vadd_s32(vget_low_s32(vacc0x6), vget_high_s32(vacc0x6));
+    const int32x2_t vpsum0x7 = vadd_s32(vget_low_s32(vacc0x7), vget_high_s32(vacc0x7));
+    const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5);
+    const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7);
+    int32x4_t vacc0x4567 = vcombine_s32(vsum0x45, vsum0x67 );
+    const int32x2_t vpsum0x8 = vadd_s32(vget_low_s32(vacc0x8), vget_high_s32(vacc0x8));
+    const int32x2_t vpsum0x9 = vadd_s32(vget_low_s32(vacc0x9), vget_high_s32(vacc0x9));
+    const int32x2_t vpsum0xA = vadd_s32(vget_low_s32(vacc0x10), vget_high_s32(vacc0x10));
+    const int32x2_t vpsum0xB = vadd_s32(vget_low_s32(vacc0x11), vget_high_s32(vacc0x11));
+    const int32x2_t vsum0x89 = vpadd_s32(vpsum0x8, vpsum0x9);
+    const int32x2_t vsum0xAB = vpadd_s32(vpsum0xA, vpsum0xB);
+    int32x4_t vacc0x89AB = vcombine_s32(vsum0x89, vsum0xAB );
+    const int32x2_t vpsum0xC = vadd_s32(vget_low_s32(vacc0x12), vget_high_s32(vacc0x12));
+    const int32x2_t vpsum0xD = vadd_s32(vget_low_s32(vacc0x13), vget_high_s32(vacc0x13));
+    const int32x2_t vpsum0xE = vadd_s32(vget_low_s32(vacc0x14), vget_high_s32(vacc0x14));
+    const int32x2_t vpsum0xF = vadd_s32(vget_low_s32(vacc0x15), vget_high_s32(vacc0x15));
+    const int32x2_t vsum0xCD = vpadd_s32(vpsum0xC, vpsum0xD);
+    const int32x2_t vsum0xEF = vpadd_s32(vpsum0xE, vpsum0xF);
+    int32x4_t vacc0xCDEF = vcombine_s32(vsum0xCD, vsum0xEF );
+    const int32x2_t vpsum1x0 = vadd_s32(vget_low_s32(vacc1x0), vget_high_s32(vacc1x0));
+    const int32x2_t vpsum1x1 = vadd_s32(vget_low_s32(vacc1x1), vget_high_s32(vacc1x1));
+    const int32x2_t vpsum1x2 = vadd_s32(vget_low_s32(vacc1x2), vget_high_s32(vacc1x2));
+    const int32x2_t vpsum1x3 = vadd_s32(vget_low_s32(vacc1x3), vget_high_s32(vacc1x3));
+    const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1);
+    const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3);
+    int32x4_t vacc1x0123 = vcombine_s32(vsum1x01, vsum1x23 );
+    const int32x2_t vpsum1x4 = vadd_s32(vget_low_s32(vacc1x4), vget_high_s32(vacc1x4));
+    const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));
+    const int32x2_t vpsum1x6 = vadd_s32(vget_low_s32(vacc1x6), vget_high_s32(vacc1x6));
+    const int32x2_t vpsum1x7 = vadd_s32(vget_low_s32(vacc1x7), vget_high_s32(vacc1x7));
+    const int32x2_t vsum1x45 = vpadd_s32(vpsum1x4, vpsum1x5);
+    const int32x2_t vsum1x67 = vpadd_s32(vpsum1x6, vpsum1x7);
+    int32x4_t vacc1x4567 = vcombine_s32(vsum1x45, vsum1x67 );
+    const int32x2_t vpsum1x8 = vadd_s32(vget_low_s32(vacc1x8), vget_high_s32(vacc1x8));
+    const int32x2_t vpsum1x9 = vadd_s32(vget_low_s32(vacc1x9), vget_high_s32(vacc1x9));
+    const int32x2_t vpsum1xA = vadd_s32(vget_low_s32(vacc1x10), vget_high_s32(vacc1x10));
+    const int32x2_t vpsum1xB = vadd_s32(vget_low_s32(vacc1x11), vget_high_s32(vacc1x11));
+    const int32x2_t vsum1x89 = vpadd_s32(vpsum1x8, vpsum1x9);
+    const int32x2_t vsum1xAB = vpadd_s32(vpsum1xA, vpsum1xB);
+    int32x4_t vacc1x89AB = vcombine_s32(vsum1x89, vsum1xAB );
+    const int32x2_t vpsum1xC = vadd_s32(vget_low_s32(vacc1x12), vget_high_s32(vacc1x12));
+    const int32x2_t vpsum1xD = vadd_s32(vget_low_s32(vacc1x13), vget_high_s32(vacc1x13));
+    const int32x2_t vpsum1xE = vadd_s32(vget_low_s32(vacc1x14), vget_high_s32(vacc1x14));
+    const int32x2_t vpsum1xF = vadd_s32(vget_low_s32(vacc1x15), vget_high_s32(vacc1x15));
+    const int32x2_t vsum1xCD = vpadd_s32(vpsum1xC, vpsum1xD);
+    const int32x2_t vsum1xEF = vpadd_s32(vpsum1xE, vpsum1xF);
+    int32x4_t vacc1xCDEF = vcombine_s32(vsum1xCD, vsum1xEF );
+#endif
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+    vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc1x89AB = vqrdmulhq_s32(vacc1x89AB, vmultiplier);
+    vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+    vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc1x89AB = vsraq_n_s32(vacc1x89AB, vbicq_s32(vacc1x89AB, vzero_shift_mask), 31);
+    vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc1x89AB = vrshlq_s32(vacc1x89AB, vright_shift);
+    vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
+    int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+    int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+    int8x16_t vout1x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc1x01234567), vqmovn_s16(vacc1x89ABCDEF));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+    vout1x0123456789ABCDEF = vmaxq_s8(vout1x0123456789ABCDEF, voutput_min);
+
+    vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+    vout1x0123456789ABCDEF = vminq_s8(vout1x0123456789ABCDEF, voutput_max);
+
+    if (nc >= 16) {
+      vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+      vst1q_s8(c1 + 0, vout1x0123456789ABCDEF);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
+      nc -= 16;
+    } else {
+      int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8(vout1x0123456789ABCDEF));
+      if (nc & 8) {
+        vst1_s8(c0, vget_low_s8(vout0x01234567_1x01234567)); c0 += 8;
+        vst1_s8(c1, vget_high_s8(vout0x01234567_1x01234567)); c1 += 8;
+        vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0123456789ABCDEF));
+      }
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
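For reference, the epilogue above is the requantization sequence shared by all of these QS8 GEMM microkernels: a Q31 fixed-point multiply (vqrdmulhq_s32), a sign fixup plus rounding right shift, a saturating narrow with the output zero point, and a clamp to [output_min, output_max]. A minimal scalar sketch of that sequence follows; the helper name is hypothetical, the INT32_MIN saturation corner of vqrdmulhq_s32 is ignored, and the two saturating narrowing steps are collapsed into a single clamp.

#include <stdint.h>

// Hypothetical scalar model of the NEON requantization above; not an XNNPACK API.
static inline int8_t requantize_acc(int32_t acc, int32_t multiplier, uint32_t shift,
                                    int16_t zero_point, int8_t qmin, int8_t qmax)
{
  // Saturating rounding doubling multiply-high, as in vqrdmulhq_s32.
  const int64_t prod = (int64_t) acc * (int64_t) multiplier;
  int32_t q = (int32_t) ((prod + (INT64_C(1) << 30)) >> 31);
  if (shift != 0) {
    // Same effect as vsraq_n_s32(..., 31) + vrshlq_s32 with a negative shift:
    // divide by 2**shift, rounding to nearest with ties away from zero.
    q += q >> 31;  // subtract 1 from negative values only
    q = (int32_t) ((q + (INT32_C(1) << (shift - 1))) >> shift);
  }
  // Add the output zero point, then clamp to the quantized output range
  // (stands in for vqmovn_s32 / vqaddq_s16 / vqmovn_s16 / vmaxq_s8 / vminq_s8).
  int32_t out = q + (int32_t) zero_point;
  if (out < (int32_t) qmin) out = (int32_t) qmin;
  if (out > (int32_t) qmax) out = (int32_t) qmax;
  return (int8_t) out;
}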
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-sse2-ld128.c b/src/qs8-gemm/gen/2x4c8-minmax-sse2-ld128.c
index 2dea0d2..d3f8bc0 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-sse2-ld128.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-sse2-ld128.c
@@ -12,6 +12,7 @@
 #include <emmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_2x4c8__sse2_ld128(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -166,12 +168,12 @@
       vout = _mm_srli_si128(vout, 4);
       *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-sse2-ld64.c b/src/qs8-gemm/gen/2x4c8-minmax-sse2-ld64.c
index 52c118d..61a1344 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-sse2-ld64.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-sse2-ld64.c
@@ -12,6 +12,7 @@
 #include <emmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_2x4c8__sse2_ld64(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -168,12 +170,12 @@
       vout = _mm_srli_si128(vout, 4);
       *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-sse41-ld128.c b/src/qs8-gemm/gen/2x4c8-minmax-sse41-ld128.c
index 166947b..94bd5be 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-sse41-ld128.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-sse41-ld128.c
@@ -12,6 +12,7 @@
 #include <smmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -141,12 +143,12 @@
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-sse41-ld64.c b/src/qs8-gemm/gen/2x4c8-minmax-sse41-ld64.c
index 139c4cf..3765c44 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-sse41-ld64.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-sse41-ld64.c
@@ -12,6 +12,7 @@
 #include <smmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -143,12 +145,12 @@
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-ssse3-ld128.c b/src/qs8-gemm/gen/2x4c8-minmax-ssse3-ld128.c
index 0d0262a..10307ca 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-ssse3-ld128.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-ssse3-ld128.c
@@ -12,6 +12,7 @@
 #include <tmmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_2x4c8__ssse3_ld128(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -166,12 +168,12 @@
       vout = _mm_srli_si128(vout, 4);
       *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-ssse3-ld64.c b/src/qs8-gemm/gen/2x4c8-minmax-ssse3-ld64.c
index 08b9f64..59cf669 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-ssse3-ld64.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-ssse3-ld64.c
@@ -12,6 +12,7 @@
 #include <tmmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_2x4c8__ssse3_ld64(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -168,12 +170,12 @@
       vout = _mm_srli_si128(vout, 4);
       *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-wasmsimd-ld128.c b/src/qs8-gemm/gen/2x4c8-minmax-wasmsimd-ld128.c
index a77acde..2ad04a9 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-wasmsimd-ld128.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-wasmsimd-ld128.c
@@ -12,6 +12,7 @@
 #include <wasm_simd128.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_2x4c8__wasmsimd_ld128(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -155,12 +157,12 @@
       *((float*) c0) = (float) wasm_f32x4_extract_lane(vout, 0);
       *((float*) c1) = (float) wasm_f32x4_extract_lane(vout, 1);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-wasmsimd-ld64.c b/src/qs8-gemm/gen/2x4c8-minmax-wasmsimd-ld64.c
index c3f5483..a00ad6c 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-wasmsimd-ld64.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-wasmsimd-ld64.c
@@ -12,6 +12,7 @@
 #include <wasm_simd128.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_2x4c8__wasmsimd_ld64(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -151,12 +153,12 @@
       *((float*) c0) = (float) wasm_f32x4_extract_lane(vout, 0);
       *((float*) c1) = (float) wasm_f32x4_extract_lane(vout, 1);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-xop-ld128.c b/src/qs8-gemm/gen/2x4c8-minmax-xop-ld128.c
index 83ccb6a..610ecd3 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-xop-ld128.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-xop-ld128.c
@@ -17,6 +17,7 @@
 #endif
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128(
@@ -40,6 +41,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -146,12 +148,12 @@
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-xop-ld64.c b/src/qs8-gemm/gen/2x4c8-minmax-xop-ld64.c
index e918a7a..5ef963f 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-xop-ld64.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-xop-ld64.c
@@ -17,6 +17,7 @@
 #endif
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64(
@@ -40,6 +41,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -148,12 +150,12 @@
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/2x4c8-xw-minmax-sse2.c b/src/qs8-gemm/gen/2x4c8-xw-minmax-sse2.c
index 38cc155..5336fcb 100644
--- a/src/qs8-gemm/gen/2x4c8-xw-minmax-sse2.c
+++ b/src/qs8-gemm/gen/2x4c8-xw-minmax-sse2.c
@@ -12,6 +12,7 @@
 #include <emmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse2(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -164,12 +166,12 @@
       vout = _mm_srli_si128(vout, 4);
       *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/2x4c8-xw-minmax-sse41.c b/src/qs8-gemm/gen/2x4c8-xw-minmax-sse41.c
index 0202b27..c6cca2c 100644
--- a/src/qs8-gemm/gen/2x4c8-xw-minmax-sse41.c
+++ b/src/qs8-gemm/gen/2x4c8-xw-minmax-sse41.c
@@ -12,6 +12,7 @@
 #include <smmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -139,12 +141,12 @@
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/2x4c8-xw-minmax-ssse3.c b/src/qs8-gemm/gen/2x4c8-xw-minmax-ssse3.c
index 79f8767..521105b 100644
--- a/src/qs8-gemm/gen/2x4c8-xw-minmax-ssse3.c
+++ b/src/qs8-gemm/gen/2x4c8-xw-minmax-ssse3.c
@@ -12,6 +12,7 @@
 #include <tmmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__ssse3(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -164,12 +166,12 @@
       vout = _mm_srli_si128(vout, 4);
       *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/2x4c8-xw-minmax-wasmsimd.c b/src/qs8-gemm/gen/2x4c8-xw-minmax-wasmsimd.c
index 00a234e..2a11031 100644
--- a/src/qs8-gemm/gen/2x4c8-xw-minmax-wasmsimd.c
+++ b/src/qs8-gemm/gen/2x4c8-xw-minmax-wasmsimd.c
@@ -12,6 +12,7 @@
 #include <wasm_simd128.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__wasmsimd(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -151,12 +153,12 @@
       *((float*) c0) = (float) wasm_f32x4_extract_lane(vout, 0);
       *((float*) c1) = (float) wasm_f32x4_extract_lane(vout, 1);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/2x4c8-xw-minmax-xop.c b/src/qs8-gemm/gen/2x4c8-xw-minmax-xop.c
index 1969160..6e41cc5 100644
--- a/src/qs8-gemm/gen/2x4c8-xw-minmax-xop.c
+++ b/src/qs8-gemm/gen/2x4c8-xw-minmax-xop.c
@@ -17,6 +17,7 @@
 #endif
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop(
@@ -40,6 +41,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -144,12 +146,12 @@
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/2x8-minmax-neon-mull-addw-dup.c b/src/qs8-gemm/gen/2x8-minmax-neon-mull-addw-dup.c
new file mode 100644
index 0000000..b5a7b2f
--- /dev/null
+++ b/src/qs8-gemm/gen/2x8-minmax-neon-mull-addw-dup.c
@@ -0,0 +1,272 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/neon-mull-addw-dup.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/gemm.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_2x8__neon_mull_addw_dup(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+
+  do {
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc1x0123 = vacc0x0123;
+    int32x4_t vacc1x4567 = vacc0x4567;
+
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
+      const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+      const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+
+      const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va0, 0));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c0));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c0));
+      const int16x8_t vprod1x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va1, 0));
+      vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c0));
+      vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c0));
+      const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va0, 1));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c1));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c1));
+      const int16x8_t vprod1x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va1, 1));
+      vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c1));
+      vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c1));
+      const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va0, 2));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c2));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c2));
+      const int16x8_t vprod1x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va1, 2));
+      vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c2));
+      vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c2));
+      const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));
+      const int16x8_t vprod1x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va1, 3));
+      vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c3));
+      vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c3));
+      const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va0, 4));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c4));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c4));
+      const int16x8_t vprod1x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va1, 4));
+      vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c4));
+      vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c4));
+      const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va0, 5));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c5));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c5));
+      const int16x8_t vprod1x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va1, 5));
+      vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c5));
+      vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c5));
+      const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va0, 6));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c6));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c6));
+      const int16x8_t vprod1x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va1, 6));
+      vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c6));
+      vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c6));
+      const int8x8_t vb01234567c7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c7 = vmull_s8(vb01234567c7, vdup_lane_s8(va0, 7));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c7));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c7));
+      const int16x8_t vprod1x01234567c7 = vmull_s8(vb01234567c7, vdup_lane_s8(va1, 7));
+      vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c7));
+      vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c7));
+
+      k -= 8 * sizeof(int8_t);
+    }
+    if XNN_UNLIKELY(k != 0) {
+      const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+      const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
+
+      const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va0, 0));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c0));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c0));
+      const int16x8_t vprod1x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va1, 0));
+      vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c0));
+      vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c0));
+
+      if (k >= 2 * sizeof(int8_t)) {
+        const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va0, 1));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c1));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c1));
+        const int16x8_t vprod1x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va1, 1));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c1));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c1));
+
+        if (k > 2 * sizeof(int8_t)) {
+          const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          const int16x8_t vprod0x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va0, 2));
+          vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c2));
+          vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c2));
+          const int16x8_t vprod1x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va1, 2));
+          vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c2));
+          vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c2));
+
+          if (k >= 4 * sizeof(int8_t)) {
+            const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+            const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));
+            vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
+            vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));
+            const int16x8_t vprod1x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va1, 3));
+            vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c3));
+            vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c3));
+
+            if (k > 4 * sizeof(int8_t)) {
+              const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+              const int16x8_t vprod0x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va0, 4));
+              vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c4));
+              vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c4));
+              const int16x8_t vprod1x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va1, 4));
+              vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c4));
+              vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c4));
+
+              if (k >= 6 * sizeof(int8_t)) {
+                const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+                const int16x8_t vprod0x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va0, 5));
+                vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c5));
+                vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c5));
+                const int16x8_t vprod1x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va1, 5));
+                vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c5));
+                vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c5));
+
+                if (k > 6 * sizeof(int8_t)) {
+                  const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+                  const int16x8_t vprod0x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va0, 6));
+                  vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c6));
+                  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c6));
+                  const int16x8_t vprod1x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va1, 6));
+                  vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c6));
+                  vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c6));
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min);
+
+    vout0x01234567_1x01234567 = vminq_s8(vout0x01234567_1x01234567, voutput_max);
+
+    if (nc >= 8) {
+      vst1_s8(c0 + 0, vget_low_s8(vout0x01234567_1x01234567));
+      vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567));
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
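The inner loop of the file above is the mull-addw-dup scheme: each activation byte is broadcast (vdup_lane_s8), multiplied against eight int8 weights into 16-bit products (vmull_s8), and widened into the 32-bit accumulators (vaddw_s16). A scalar sketch of one 1x8 tile step over eight k values, with a hypothetical helper name and a weight layout matching the eight-byte groups consumed from w:

#include <stddef.h>
#include <stdint.h>

// Hypothetical scalar model of the vmull_s8 + vaddw_s16 loop above
// (one row, eight output channels, eight k values); not an XNNPACK API.
static void gemm_1x8_mull_addw_step(int32_t acc[8], const int8_t a[8], const int8_t w[8][8])
{
  for (size_t k = 0; k < 8; k++) {
    for (size_t n = 0; n < 8; n++) {
      // An int8 x int8 product always fits in int16; the widening add then
      // folds it into the int32 running sum, so no partial product is lost.
      acc[n] += (int32_t) ((int16_t) a[k] * (int16_t) w[k][n]);
    }
  }
}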
diff --git a/src/qs8-gemm/gen/2x8c16-minmax-neon-mlal-padal.c b/src/qs8-gemm/gen/2x8c16-minmax-neon-mlal-padal.c
new file mode 100644
index 0000000..a2d58a2
--- /dev/null
+++ b/src/qs8-gemm/gen/2x8c16-minmax-neon-mlal-padal.c
@@ -0,0 +1,244 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/c16-neon-mlal-padal.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_2x8c16__neon_mlal_padal(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 16);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+
+  do {
+    int32x4_t vacc0x0 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x1 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x2 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x3 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x4 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x5 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x6 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x7 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc1x0 = vacc0x0;
+    int32x4_t vacc1x1 = vacc0x1;
+    int32x4_t vacc1x2 = vacc0x2;
+    int32x4_t vacc1x3 = vacc0x3;
+    int32x4_t vacc1x4 = vacc0x4;
+    int32x4_t vacc1x5 = vacc0x5;
+    int32x4_t vacc1x6 = vacc0x6;
+    int32x4_t vacc1x7 = vacc0x7;
+
+    // KC loop of 16
+    size_t k = 0;
+    while (k < kc) {
+      const int8x16_t va0 = vld1q_s8(a0); a0 += 16;
+      const int8x16_t va1 = vld1q_s8(a1); a1 += 16;
+
+      const int8x16_t vb0 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb1 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb2 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb3 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb4 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb5 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb6 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb7 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+
+      int16x8_t vprod0x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va0));
+      int16x8_t vprod1x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va1));
+      vprod0x0 = vmlal_s8(vprod0x0, vget_high_s8(vb0), vget_high_s8(va0));
+      vprod1x0 = vmlal_s8(vprod1x0, vget_high_s8(vb0), vget_high_s8(va1));
+      vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+      vacc1x0 = vpadalq_s16(vacc1x0, vprod1x0);
+      int16x8_t vprod0x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va0));
+      int16x8_t vprod1x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va1));
+      vprod0x1 = vmlal_s8(vprod0x1, vget_high_s8(vb1), vget_high_s8(va0));
+      vprod1x1 = vmlal_s8(vprod1x1, vget_high_s8(vb1), vget_high_s8(va1));
+      vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+      vacc1x1 = vpadalq_s16(vacc1x1, vprod1x1);
+      int16x8_t vprod0x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va0));
+      int16x8_t vprod1x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va1));
+      vprod0x2 = vmlal_s8(vprod0x2, vget_high_s8(vb2), vget_high_s8(va0));
+      vprod1x2 = vmlal_s8(vprod1x2, vget_high_s8(vb2), vget_high_s8(va1));
+      vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+      vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);
+      int16x8_t vprod0x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va0));
+      int16x8_t vprod1x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va1));
+      vprod0x3 = vmlal_s8(vprod0x3, vget_high_s8(vb3), vget_high_s8(va0));
+      vprod1x3 = vmlal_s8(vprod1x3, vget_high_s8(vb3), vget_high_s8(va1));
+      vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+      vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);
+      int16x8_t vprod0x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va0));
+      int16x8_t vprod1x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va1));
+      vprod0x4 = vmlal_s8(vprod0x4, vget_high_s8(vb4), vget_high_s8(va0));
+      vprod1x4 = vmlal_s8(vprod1x4, vget_high_s8(vb4), vget_high_s8(va1));
+      vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+      vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);
+      int16x8_t vprod0x5 = vmull_s8(vget_low_s8(vb5), vget_low_s8(va0));
+      int16x8_t vprod1x5 = vmull_s8(vget_low_s8(vb5), vget_low_s8(va1));
+      vprod0x5 = vmlal_s8(vprod0x5, vget_high_s8(vb5), vget_high_s8(va0));
+      vprod1x5 = vmlal_s8(vprod1x5, vget_high_s8(vb5), vget_high_s8(va1));
+      vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+      vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);
+      int16x8_t vprod0x6 = vmull_s8(vget_low_s8(vb6), vget_low_s8(va0));
+      int16x8_t vprod1x6 = vmull_s8(vget_low_s8(vb6), vget_low_s8(va1));
+      vprod0x6 = vmlal_s8(vprod0x6, vget_high_s8(vb6), vget_high_s8(va0));
+      vprod1x6 = vmlal_s8(vprod1x6, vget_high_s8(vb6), vget_high_s8(va1));
+      vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+      vacc1x6 = vpadalq_s16(vacc1x6, vprod1x6);
+      int16x8_t vprod0x7 = vmull_s8(vget_low_s8(vb7), vget_low_s8(va0));
+      int16x8_t vprod1x7 = vmull_s8(vget_low_s8(vb7), vget_low_s8(va1));
+      vprod0x7 = vmlal_s8(vprod0x7, vget_high_s8(vb7), vget_high_s8(va0));
+      vprod1x7 = vmlal_s8(vprod1x7, vget_high_s8(vb7), vget_high_s8(va1));
+      vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+      vacc1x7 = vpadalq_s16(vacc1x7, vprod1x7);
+
+      k += 16 * sizeof(int8_t);
+    }
+
+#if XNN_ARCH_ARM64
+    const int32x4_t vsum0x01 = vpaddq_s32(vacc0x0, vacc0x1);
+    const int32x4_t vsum0x23 = vpaddq_s32(vacc0x2, vacc0x3);
+    const int32x4_t vsum0x45 = vpaddq_s32(vacc0x4, vacc0x5);
+    const int32x4_t vsum0x67 = vpaddq_s32(vacc0x6, vacc0x7);
+    const int32x4_t vsum1x01 = vpaddq_s32(vacc1x0, vacc1x1);
+    const int32x4_t vsum1x23 = vpaddq_s32(vacc1x2, vacc1x3);
+    const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);
+    const int32x4_t vsum1x67 = vpaddq_s32(vacc1x6, vacc1x7);
+    int32x4_t vacc0x0123 = vpaddq_s32(vsum0x01, vsum0x23);
+    int32x4_t vacc0x4567 = vpaddq_s32(vsum0x45, vsum0x67);
+    int32x4_t vacc1x0123 = vpaddq_s32(vsum1x01, vsum1x23);
+    int32x4_t vacc1x4567 = vpaddq_s32(vsum1x45, vsum1x67);
+#else
+    const int32x2_t vpsum0x0 = vadd_s32(vget_low_s32(vacc0x0), vget_high_s32(vacc0x0));
+    const int32x2_t vpsum0x1 = vadd_s32(vget_low_s32(vacc0x1), vget_high_s32(vacc0x1));
+    const int32x2_t vpsum0x2 = vadd_s32(vget_low_s32(vacc0x2), vget_high_s32(vacc0x2));
+    const int32x2_t vpsum0x3 = vadd_s32(vget_low_s32(vacc0x3), vget_high_s32(vacc0x3));
+    const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1);
+    const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3);
+    int32x4_t vacc0x0123 = vcombine_s32(vsum0x01, vsum0x23 );
+    const int32x2_t vpsum0x4 = vadd_s32(vget_low_s32(vacc0x4), vget_high_s32(vacc0x4));
+    const int32x2_t vpsum0x5 = vadd_s32(vget_low_s32(vacc0x5), vget_high_s32(vacc0x5));
+    const int32x2_t vpsum0x6 = vadd_s32(vget_low_s32(vacc0x6), vget_high_s32(vacc0x6));
+    const int32x2_t vpsum0x7 = vadd_s32(vget_low_s32(vacc0x7), vget_high_s32(vacc0x7));
+    const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5);
+    const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7);
+    int32x4_t vacc0x4567 = vcombine_s32(vsum0x45, vsum0x67 );
+    const int32x2_t vpsum1x0 = vadd_s32(vget_low_s32(vacc1x0), vget_high_s32(vacc1x0));
+    const int32x2_t vpsum1x1 = vadd_s32(vget_low_s32(vacc1x1), vget_high_s32(vacc1x1));
+    const int32x2_t vpsum1x2 = vadd_s32(vget_low_s32(vacc1x2), vget_high_s32(vacc1x2));
+    const int32x2_t vpsum1x3 = vadd_s32(vget_low_s32(vacc1x3), vget_high_s32(vacc1x3));
+    const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1);
+    const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3);
+    int32x4_t vacc1x0123 = vcombine_s32(vsum1x01, vsum1x23 );
+    const int32x2_t vpsum1x4 = vadd_s32(vget_low_s32(vacc1x4), vget_high_s32(vacc1x4));
+    const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));
+    const int32x2_t vpsum1x6 = vadd_s32(vget_low_s32(vacc1x6), vget_high_s32(vacc1x6));
+    const int32x2_t vpsum1x7 = vadd_s32(vget_low_s32(vacc1x7), vget_high_s32(vacc1x7));
+    const int32x2_t vsum1x45 = vpadd_s32(vpsum1x4, vpsum1x5);
+    const int32x2_t vsum1x67 = vpadd_s32(vpsum1x6, vpsum1x7);
+    int32x4_t vacc1x4567 = vcombine_s32(vsum1x45, vsum1x67 );
+#endif
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min);
+
+    vout0x01234567_1x01234567 = vminq_s8(vout0x01234567_1x01234567, voutput_max);
+
+    if (nc >= 8) {
+      vst1_s8(c0 + 0, vget_low_s8(vout0x01234567_1x01234567));
+      vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567));
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
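The c16 kernel above, like the c8 hunks earlier in this change, now pulls in <xnnpack/math.h> and pads kc with round_up_po2() before use, so the K loop consumes whole 16-byte (or 8-byte) groups and the activation pointers are rewound by the same padded kc after each tile of output columns. A sketch of that helper, assuming the quantum q is a power of two:

#include <stddef.h>

// Round n up to the next multiple of q, where q must be a power of two,
// e.g. round_up_po2(kc, 16) in the kernel above.
static inline size_t round_up_po2(size_t n, size_t q) {
  return (n + q - 1) & ~(q - 1);
}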
diff --git a/src/qs8-gemm/gen/2x8c2-minmax-neon-mlal-padal-dup.c b/src/qs8-gemm/gen/2x8c2-minmax-neon-mlal-padal-dup.c
new file mode 100644
index 0000000..850d04d
--- /dev/null
+++ b/src/qs8-gemm/gen/2x8c2-minmax-neon-mlal-padal-dup.c
@@ -0,0 +1,292 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/c2-neon-mull-padal-dup.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+
+  do {
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc1x0123 = vacc0x0123;
+    int32x4_t vacc1x4567 = vacc0x4567;
+
+    size_t k = kc;
+
+    while (k >= 16 * sizeof(int8_t)) {
+      const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;
+      const int8x8_t va0x1 = vld1_s8(a0); a0 += 8;
+      const int8x8_t va1x0 = vld1_s8(a1); a1 += 8;
+      const int8x8_t va1x1 = vld1_s8(a1); a1 += 8;
+
+      const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+      int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0)));
+      const int8x8_t vb0123c0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x0123c0 = vmlal_s8(vprod0x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+      vprod1x0123c0 = vmlal_s8(vprod1x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+      int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+      int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0)));
+      const int8x8_t vb4567c0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x4567c0 = vmlal_s8(vprod0x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+      vprod1x4567c0 = vmlal_s8(vprod1x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+      int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+      int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 1)));
+      const int8x8_t vb0123c1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x0123c1 = vmlal_s8(vprod0x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+      vprod1x0123c1 = vmlal_s8(vprod1x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+      int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+      int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 1)));
+      const int8x8_t vb4567c1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x4567c1 = vmlal_s8(vprod0x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+      vprod1x4567c1 = vmlal_s8(vprod1x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+      int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+      int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 2)));
+      const int8x8_t vb0123c2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x0123c2 = vmlal_s8(vprod0x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+      vprod1x0123c2 = vmlal_s8(vprod1x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+      int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+      int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 2)));
+      const int8x8_t vb4567c2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x4567c2 = vmlal_s8(vprod0x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+      vprod1x4567c2 = vmlal_s8(vprod1x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+      int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+      int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 3)));
+      const int8x8_t vb0123c3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x0123c3 = vmlal_s8(vprod0x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+      vprod1x0123c3 = vmlal_s8(vprod1x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+      int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+      int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 3)));
+      const int8x8_t vb4567c3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x4567c3 = vmlal_s8(vprod0x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+      vprod1x4567c3 = vmlal_s8(vprod1x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+
+      k -= 16 * sizeof(int8_t);
+    }
+
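+    // Up to 8 more bytes are handled below with MULL only; any final 2-, 4-, or 6-byte remainder follows after that.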
+    if (k >= 8 * sizeof(int8_t)) {
+      const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+      const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+
+      const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+      const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+      const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+      const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+      const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+      const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+      const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+      const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+      const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+      const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+      const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+      const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+
+      k -= 8 * sizeof(int8_t);
+    }
+
+    if XNN_UNLIKELY(k != 0) {
+      const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+      const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
+
+      const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+      const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+      const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+      const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+
+      if (k > 2 * sizeof(int8_t)) {
+        const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+        const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+        const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+        const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+
+        if (k > 4 * sizeof(int8_t)) {
+          const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+          vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+          const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+          vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+          const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+          vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+          const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+          vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+        }
+      }
+    }
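+    // Requantize the int32 accumulators: saturating-doubling Q31 multiply, a sign-dependent fixup, then a rounding right shift.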
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+
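+    // Add the output zero point in int16, saturating-narrow to int8, and clamp to [output_min, output_max] before storing.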
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min);
+
+    vout0x01234567_1x01234567 = vminq_s8(vout0x01234567_1x01234567, voutput_max);
+
+    if (nc >= 8) {
+      vst1_s8(c0 + 0, vget_low_s8(vout0x01234567_1x01234567));
+      vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567));
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-gemm/gen/2x8c2-minmax-neon-mull-padal-dup.c b/src/qs8-gemm/gen/2x8c2-minmax-neon-mull-padal-dup.c
new file mode 100644
index 0000000..a98d1e9
--- /dev/null
+++ b/src/qs8-gemm/gen/2x8c2-minmax-neon-mull-padal-dup.c
@@ -0,0 +1,218 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/c2-neon-mull-padal-dup.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mull_padal_dup(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+
+  do {
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc1x0123 = vacc0x0123;
+    int32x4_t vacc1x4567 = vacc0x4567;
+
+    size_t k = kc;
+
+
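+    // Main loop: each iteration consumes 8 bytes of A per row; every 16-bit pair of A is broadcast (dup) and multiplied with the matching 2-element columns of B, and the 16-bit products are pairwise-accumulated into the int32 lanes.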
+    while (k >= 8 * sizeof(int8_t)) {
+      const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+      const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+
+      const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+      const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+      const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+      const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+      const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+      const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+      const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+      const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+      const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+      const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+      const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+      const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+
+      k -= 8 * sizeof(int8_t);
+    }
+
+    if XNN_UNLIKELY(k != 0) {
+      const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+      const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
+
+      const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+      const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+      const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+      const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+
+      if (k > 2 * sizeof(int8_t)) {
+        const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+        const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+        const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+        const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+
+        if (k > 4 * sizeof(int8_t)) {
+          const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+          vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+          const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+          vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+          const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+          vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+          const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+          vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+        }
+      }
+    }
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min);
+
+    vout0x01234567_1x01234567 = vminq_s8(vout0x01234567_1x01234567, voutput_max);
+
+    if (nc >= 8) {
+      vst1_s8(c0 + 0, vget_low_s8(vout0x01234567_1x01234567));
+      vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567));
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-gemm/gen/2x8c8-minmax-avx2.c b/src/qs8-gemm/gen/2x8c8-minmax-avx2.c
index 677a96b..f449528 100644
--- a/src/qs8-gemm/gen/2x8c8-minmax-avx2.c
+++ b/src/qs8-gemm/gen/2x8c8-minmax-avx2.c
@@ -13,6 +13,7 @@
 
 #include <xnnpack/gemm.h>
 #include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2(
@@ -36,6 +37,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -160,12 +162,12 @@
       _mm_storel_epi64((__m128i*) c0, vout_lo);
       _mm_storel_epi64((__m128i*) c1, vout_hi);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
       nc -= 8;
     } else {
       if (nc & 4) {
diff --git a/src/qs8-gemm/gen/2x8c8-minmax-neon-mlal-padal.c b/src/qs8-gemm/gen/2x8c8-minmax-neon-mlal-padal.c
new file mode 100644
index 0000000..0dd5411
--- /dev/null
+++ b/src/qs8-gemm/gen/2x8c8-minmax-neon-mlal-padal.c
@@ -0,0 +1,303 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/c8-neon-mull-padal.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+
+  do {
+    int32x4_t vacc0x0 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x1 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x2 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x3 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x4 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x5 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x6 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x7 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc1x0 = vacc0x0;
+    int32x4_t vacc1x1 = vacc0x1;
+    int32x4_t vacc1x2 = vacc0x2;
+    int32x4_t vacc1x3 = vacc0x3;
+    int32x4_t vacc1x4 = vacc0x4;
+    int32x4_t vacc1x5 = vacc0x5;
+    int32x4_t vacc1x6 = vacc0x6;
+    int32x4_t vacc1x7 = vacc0x7;
+
+    size_t k = kc;
+    // Main loop, partially unrolled 2x: load 16 bytes at a time and accumulate using MLA.
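+    // Each 8-byte half of A is multiplied with the matching half of B (MULL for the first half, MLAL for the second), and the 16-bit products are pairwise-accumulated into the int32 accumulators (PADAL).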
+    while (k >= 16 * sizeof(int8_t)) {
+      const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;
+      const int8x8_t va0x1 = vld1_s8(a0); a0 += 8;
+      const int8x8_t va1x0 = vld1_s8(a1); a1 += 8;
+      const int8x8_t va1x1 = vld1_s8(a1); a1 += 8;
+
+      const int8x8_t vb0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb4x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb5x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb6x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb7x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+
+      const int8x8_t vb0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x0 = vmull_s8(vb0x0, va0x0);
+      int16x8_t vprod1x0 = vmull_s8(vb0x0, va1x0);
+      vprod0x0 = vmlal_s8(vprod0x0, vb0x1, va0x1);
+      vprod1x0 = vmlal_s8(vprod1x0, vb0x1, va1x1);
+      vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+      vacc1x0 = vpadalq_s16(vacc1x0, vprod1x0);
+      const int8x8_t vb1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x1 = vmull_s8(vb1x0, va0x0);
+      int16x8_t vprod1x1 = vmull_s8(vb1x0, va1x0);
+      vprod0x1 = vmlal_s8(vprod0x1, vb1x1, va0x1);
+      vprod1x1 = vmlal_s8(vprod1x1, vb1x1, va1x1);
+      vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+      vacc1x1 = vpadalq_s16(vacc1x1, vprod1x1);
+      const int8x8_t vb2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x2 = vmull_s8(vb2x0, va0x0);
+      int16x8_t vprod1x2 = vmull_s8(vb2x0, va1x0);
+      vprod0x2 = vmlal_s8(vprod0x2, vb2x1, va0x1);
+      vprod1x2 = vmlal_s8(vprod1x2, vb2x1, va1x1);
+      vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+      vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);
+      const int8x8_t vb3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x3 = vmull_s8(vb3x0, va0x0);
+      int16x8_t vprod1x3 = vmull_s8(vb3x0, va1x0);
+      vprod0x3 = vmlal_s8(vprod0x3, vb3x1, va0x1);
+      vprod1x3 = vmlal_s8(vprod1x3, vb3x1, va1x1);
+      vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+      vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);
+      const int8x8_t vb4x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x4 = vmull_s8(vb4x0, va0x0);
+      int16x8_t vprod1x4 = vmull_s8(vb4x0, va1x0);
+      vprod0x4 = vmlal_s8(vprod0x4, vb4x1, va0x1);
+      vprod1x4 = vmlal_s8(vprod1x4, vb4x1, va1x1);
+      vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+      vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);
+      const int8x8_t vb5x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x5 = vmull_s8(vb5x0, va0x0);
+      int16x8_t vprod1x5 = vmull_s8(vb5x0, va1x0);
+      vprod0x5 = vmlal_s8(vprod0x5, vb5x1, va0x1);
+      vprod1x5 = vmlal_s8(vprod1x5, vb5x1, va1x1);
+      vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+      vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);
+      const int8x8_t vb6x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x6 = vmull_s8(vb6x0, va0x0);
+      int16x8_t vprod1x6 = vmull_s8(vb6x0, va1x0);
+      vprod0x6 = vmlal_s8(vprod0x6, vb6x1, va0x1);
+      vprod1x6 = vmlal_s8(vprod1x6, vb6x1, va1x1);
+      vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+      vacc1x6 = vpadalq_s16(vacc1x6, vprod1x6);
+      const int8x8_t vb7x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x7 = vmull_s8(vb7x0, va0x0);
+      int16x8_t vprod1x7 = vmull_s8(vb7x0, va1x0);
+      vprod0x7 = vmlal_s8(vprod0x7, vb7x1, va0x1);
+      vprod1x7 = vmlal_s8(vprod1x7, vb7x1, va1x1);
+      vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+      vacc1x7 = vpadalq_s16(vacc1x7, vprod1x7);
+
+      k -= 16 * sizeof(int8_t);
+    }
+
+    // Handle the remaining 8 bytes, if any, using MUL.
+    if (k > 0) {
+      const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+      const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+
+      const int8x8_t vb0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x0 = vmull_s8(vb0, va0);
+      const int16x8_t vprod1x0 = vmull_s8(vb0, va1);
+      vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+      vacc1x0 = vpadalq_s16(vacc1x0, vprod1x0);
+      const int8x8_t vb1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x1 = vmull_s8(vb1, va0);
+      const int16x8_t vprod1x1 = vmull_s8(vb1, va1);
+      vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+      vacc1x1 = vpadalq_s16(vacc1x1, vprod1x1);
+      const int8x8_t vb2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x2 = vmull_s8(vb2, va0);
+      const int16x8_t vprod1x2 = vmull_s8(vb2, va1);
+      vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+      vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);
+      const int8x8_t vb3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x3 = vmull_s8(vb3, va0);
+      const int16x8_t vprod1x3 = vmull_s8(vb3, va1);
+      vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+      vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);
+      const int8x8_t vb4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x4 = vmull_s8(vb4, va0);
+      const int16x8_t vprod1x4 = vmull_s8(vb4, va1);
+      vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+      vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);
+      const int8x8_t vb5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x5 = vmull_s8(vb5, va0);
+      const int16x8_t vprod1x5 = vmull_s8(vb5, va1);
+      vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+      vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);
+      const int8x8_t vb6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x6 = vmull_s8(vb6, va0);
+      const int16x8_t vprod1x6 = vmull_s8(vb6, va1);
+      vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+      vacc1x6 = vpadalq_s16(vacc1x6, vprod1x6);
+      const int8x8_t vb7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x7 = vmull_s8(vb7, va0);
+      const int16x8_t vprod1x7 = vmull_s8(vb7, va1);
+      vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+      vacc1x7 = vpadalq_s16(vacc1x7, vprod1x7);
+
+      k -= 8 * sizeof(int8_t);
+    }
+
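+    // Reduce each row's eight per-channel int32x4 partial sums to two int32x4 vectors of per-channel totals.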
+#if XNN_ARCH_ARM64
+    const int32x4_t vsum0x01 = vpaddq_s32(vacc0x0, vacc0x1);
+    const int32x4_t vsum0x23 = vpaddq_s32(vacc0x2, vacc0x3);
+    const int32x4_t vsum0x45 = vpaddq_s32(vacc0x4, vacc0x5);
+    const int32x4_t vsum0x67 = vpaddq_s32(vacc0x6, vacc0x7);
+    const int32x4_t vsum1x01 = vpaddq_s32(vacc1x0, vacc1x1);
+    const int32x4_t vsum1x23 = vpaddq_s32(vacc1x2, vacc1x3);
+    const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);
+    const int32x4_t vsum1x67 = vpaddq_s32(vacc1x6, vacc1x7);
+    int32x4_t vacc0x0123 = vpaddq_s32(vsum0x01, vsum0x23);
+    int32x4_t vacc0x4567 = vpaddq_s32(vsum0x45, vsum0x67);
+    int32x4_t vacc1x0123 = vpaddq_s32(vsum1x01, vsum1x23);
+    int32x4_t vacc1x4567 = vpaddq_s32(vsum1x45, vsum1x67);
+#else
+    const int32x2_t vpsum0x0 = vadd_s32(vget_low_s32(vacc0x0), vget_high_s32(vacc0x0));
+    const int32x2_t vpsum0x1 = vadd_s32(vget_low_s32(vacc0x1), vget_high_s32(vacc0x1));
+    const int32x2_t vpsum0x2 = vadd_s32(vget_low_s32(vacc0x2), vget_high_s32(vacc0x2));
+    const int32x2_t vpsum0x3 = vadd_s32(vget_low_s32(vacc0x3), vget_high_s32(vacc0x3));
+    const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1);
+    const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3);
+    int32x4_t vacc0x0123 = vcombine_s32(vsum0x01, vsum0x23 );
+    const int32x2_t vpsum0x4 = vadd_s32(vget_low_s32(vacc0x4), vget_high_s32(vacc0x4));
+    const int32x2_t vpsum0x5 = vadd_s32(vget_low_s32(vacc0x5), vget_high_s32(vacc0x5));
+    const int32x2_t vpsum0x6 = vadd_s32(vget_low_s32(vacc0x6), vget_high_s32(vacc0x6));
+    const int32x2_t vpsum0x7 = vadd_s32(vget_low_s32(vacc0x7), vget_high_s32(vacc0x7));
+    const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5);
+    const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7);
+    int32x4_t vacc0x4567 = vcombine_s32(vsum0x45, vsum0x67 );
+    const int32x2_t vpsum1x0 = vadd_s32(vget_low_s32(vacc1x0), vget_high_s32(vacc1x0));
+    const int32x2_t vpsum1x1 = vadd_s32(vget_low_s32(vacc1x1), vget_high_s32(vacc1x1));
+    const int32x2_t vpsum1x2 = vadd_s32(vget_low_s32(vacc1x2), vget_high_s32(vacc1x2));
+    const int32x2_t vpsum1x3 = vadd_s32(vget_low_s32(vacc1x3), vget_high_s32(vacc1x3));
+    const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1);
+    const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3);
+    int32x4_t vacc1x0123 = vcombine_s32(vsum1x01, vsum1x23 );
+    const int32x2_t vpsum1x4 = vadd_s32(vget_low_s32(vacc1x4), vget_high_s32(vacc1x4));
+    const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));
+    const int32x2_t vpsum1x6 = vadd_s32(vget_low_s32(vacc1x6), vget_high_s32(vacc1x6));
+    const int32x2_t vpsum1x7 = vadd_s32(vget_low_s32(vacc1x7), vget_high_s32(vacc1x7));
+    const int32x2_t vsum1x45 = vpadd_s32(vpsum1x4, vpsum1x5);
+    const int32x2_t vsum1x67 = vpadd_s32(vpsum1x6, vpsum1x7);
+    int32x4_t vacc1x4567 = vcombine_s32(vsum1x45, vsum1x67 );
+#endif
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min);
+
+    vout0x01234567_1x01234567 = vminq_s8(vout0x01234567_1x01234567, voutput_max);
+
+    if (nc >= 8) {
+      vst1_s8(c0 + 0, vget_low_s8(vout0x01234567_1x01234567));
+      vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567));
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-gemm/gen/2x8c8-minmax-neon-mull-padal.c b/src/qs8-gemm/gen/2x8c8-minmax-neon-mull-padal.c
new file mode 100644
index 0000000..c0aa72e
--- /dev/null
+++ b/src/qs8-gemm/gen/2x8c8-minmax-neon-mull-padal.c
@@ -0,0 +1,228 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/c8-neon-mull-padal.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mull_padal(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+
+  do {
+    int32x4_t vacc0x0 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x1 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x2 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x3 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x4 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x5 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x6 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x7 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc1x0 = vacc0x0;
+    int32x4_t vacc1x1 = vacc0x1;
+    int32x4_t vacc1x2 = vacc0x2;
+    int32x4_t vacc1x3 = vacc0x3;
+    int32x4_t vacc1x4 = vacc0x4;
+    int32x4_t vacc1x5 = vacc0x5;
+    int32x4_t vacc1x6 = vacc0x6;
+    int32x4_t vacc1x7 = vacc0x7;
+
+    size_t k = kc;
+
+    // Handle 8 bytes at a time using MUL.
+    while (k > 0) {
+      const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+      const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+
+      const int8x8_t vb0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x0 = vmull_s8(vb0, va0);
+      const int16x8_t vprod1x0 = vmull_s8(vb0, va1);
+      vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+      vacc1x0 = vpadalq_s16(vacc1x0, vprod1x0);
+      const int8x8_t vb1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x1 = vmull_s8(vb1, va0);
+      const int16x8_t vprod1x1 = vmull_s8(vb1, va1);
+      vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+      vacc1x1 = vpadalq_s16(vacc1x1, vprod1x1);
+      const int8x8_t vb2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x2 = vmull_s8(vb2, va0);
+      const int16x8_t vprod1x2 = vmull_s8(vb2, va1);
+      vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+      vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);
+      const int8x8_t vb3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x3 = vmull_s8(vb3, va0);
+      const int16x8_t vprod1x3 = vmull_s8(vb3, va1);
+      vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+      vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);
+      const int8x8_t vb4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x4 = vmull_s8(vb4, va0);
+      const int16x8_t vprod1x4 = vmull_s8(vb4, va1);
+      vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+      vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);
+      const int8x8_t vb5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x5 = vmull_s8(vb5, va0);
+      const int16x8_t vprod1x5 = vmull_s8(vb5, va1);
+      vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+      vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);
+      const int8x8_t vb6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x6 = vmull_s8(vb6, va0);
+      const int16x8_t vprod1x6 = vmull_s8(vb6, va1);
+      vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+      vacc1x6 = vpadalq_s16(vacc1x6, vprod1x6);
+      const int8x8_t vb7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x7 = vmull_s8(vb7, va0);
+      const int16x8_t vprod1x7 = vmull_s8(vb7, va1);
+      vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+      vacc1x7 = vpadalq_s16(vacc1x7, vprod1x7);
+
+      k -= 8 * sizeof(int8_t);
+    }
+
+#if XNN_ARCH_ARM64
+    const int32x4_t vsum0x01 = vpaddq_s32(vacc0x0, vacc0x1);
+    const int32x4_t vsum0x23 = vpaddq_s32(vacc0x2, vacc0x3);
+    const int32x4_t vsum0x45 = vpaddq_s32(vacc0x4, vacc0x5);
+    const int32x4_t vsum0x67 = vpaddq_s32(vacc0x6, vacc0x7);
+    const int32x4_t vsum1x01 = vpaddq_s32(vacc1x0, vacc1x1);
+    const int32x4_t vsum1x23 = vpaddq_s32(vacc1x2, vacc1x3);
+    const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);
+    const int32x4_t vsum1x67 = vpaddq_s32(vacc1x6, vacc1x7);
+    int32x4_t vacc0x0123 = vpaddq_s32(vsum0x01, vsum0x23);
+    int32x4_t vacc0x4567 = vpaddq_s32(vsum0x45, vsum0x67);
+    int32x4_t vacc1x0123 = vpaddq_s32(vsum1x01, vsum1x23);
+    int32x4_t vacc1x4567 = vpaddq_s32(vsum1x45, vsum1x67);
+#else
+    const int32x2_t vpsum0x0 = vadd_s32(vget_low_s32(vacc0x0), vget_high_s32(vacc0x0));
+    const int32x2_t vpsum0x1 = vadd_s32(vget_low_s32(vacc0x1), vget_high_s32(vacc0x1));
+    const int32x2_t vpsum0x2 = vadd_s32(vget_low_s32(vacc0x2), vget_high_s32(vacc0x2));
+    const int32x2_t vpsum0x3 = vadd_s32(vget_low_s32(vacc0x3), vget_high_s32(vacc0x3));
+    const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1);
+    const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3);
+    int32x4_t vacc0x0123 = vcombine_s32(vsum0x01, vsum0x23 );
+    const int32x2_t vpsum0x4 = vadd_s32(vget_low_s32(vacc0x4), vget_high_s32(vacc0x4));
+    const int32x2_t vpsum0x5 = vadd_s32(vget_low_s32(vacc0x5), vget_high_s32(vacc0x5));
+    const int32x2_t vpsum0x6 = vadd_s32(vget_low_s32(vacc0x6), vget_high_s32(vacc0x6));
+    const int32x2_t vpsum0x7 = vadd_s32(vget_low_s32(vacc0x7), vget_high_s32(vacc0x7));
+    const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5);
+    const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7);
+    int32x4_t vacc0x4567 = vcombine_s32(vsum0x45, vsum0x67 );
+    const int32x2_t vpsum1x0 = vadd_s32(vget_low_s32(vacc1x0), vget_high_s32(vacc1x0));
+    const int32x2_t vpsum1x1 = vadd_s32(vget_low_s32(vacc1x1), vget_high_s32(vacc1x1));
+    const int32x2_t vpsum1x2 = vadd_s32(vget_low_s32(vacc1x2), vget_high_s32(vacc1x2));
+    const int32x2_t vpsum1x3 = vadd_s32(vget_low_s32(vacc1x3), vget_high_s32(vacc1x3));
+    const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1);
+    const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3);
+    int32x4_t vacc1x0123 = vcombine_s32(vsum1x01, vsum1x23 );
+    const int32x2_t vpsum1x4 = vadd_s32(vget_low_s32(vacc1x4), vget_high_s32(vacc1x4));
+    const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));
+    const int32x2_t vpsum1x6 = vadd_s32(vget_low_s32(vacc1x6), vget_high_s32(vacc1x6));
+    const int32x2_t vpsum1x7 = vadd_s32(vget_low_s32(vacc1x7), vget_high_s32(vacc1x7));
+    const int32x2_t vsum1x45 = vpadd_s32(vpsum1x4, vpsum1x5);
+    const int32x2_t vsum1x67 = vpadd_s32(vpsum1x6, vpsum1x7);
+    int32x4_t vacc1x4567 = vcombine_s32(vsum1x45, vsum1x67 );
+#endif
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min);
+
+    vout0x01234567_1x01234567 = vminq_s8(vout0x01234567_1x01234567, voutput_max);
+
+    if (nc >= 8) {
+      vst1_s8(c0 + 0, vget_low_s8(vout0x01234567_1x01234567));
+      vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567));
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-gemm/gen/2x8c8-xw-minmax-avx2.c b/src/qs8-gemm/gen/2x8c8-xw-minmax-avx2.c
index bc7a3b3..e228916 100644
--- a/src/qs8-gemm/gen/2x8c8-xw-minmax-avx2.c
+++ b/src/qs8-gemm/gen/2x8c8-xw-minmax-avx2.c
@@ -13,6 +13,7 @@
 
 #include <xnnpack/gemm.h>
 #include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2(
@@ -36,6 +37,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -156,12 +158,12 @@
       _mm_storel_epi64((__m128i*) c0, vout_lo);
       _mm_storel_epi64((__m128i*) c1, vout_hi);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
       nc -= 8;
     } else {
       if (nc & 4) {
diff --git a/src/qs8-gemm/gen/3x16-minmax-neon-mlal-lane.c b/src/qs8-gemm/gen/3x16-minmax-neon-mlal-lane.c
new file mode 100644
index 0000000..39aefca
--- /dev/null
+++ b/src/qs8-gemm/gen/3x16-minmax-neon-mlal-lane.c
@@ -0,0 +1,496 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/neon-mlal-lane.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/gemm.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+
+  do {
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc1x0123 = vacc0x0123;
+    int32x4_t vacc1x4567 = vacc0x4567;
+    int32x4_t vacc1x89AB = vacc0x89AB;
+    int32x4_t vacc1xCDEF = vacc0xCDEF;
+    int32x4_t vacc2x0123 = vacc0x0123;
+    int32x4_t vacc2x4567 = vacc0x4567;
+    int32x4_t vacc2x89AB = vacc0x89AB;
+    int32x4_t vacc2xCDEF = vacc0xCDEF;
+
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
+      const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+      const int16x8_t vxa0 = vmovl_s8(va0);
+      const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+      const int16x8_t vxa1 = vmovl_s8(va1);
+      const int8x8_t va2 = vld1_s8(a2); a2 += 8;
+      const int16x8_t vxa2 = vmovl_s8(va2);
+
+      const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vxb01234567c0 = vmovl_s8(vb01234567c0);
+
+      vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+      vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+      vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
+      vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
+      vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
+      vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
+      const int8x8_t vb89ABCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vxb89ABCDEFc0 = vmovl_s8(vb89ABCDEFc0);
+
+      vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
+      vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
+      vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
+      vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
+      vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
+      vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
+      const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vxb01234567c1 = vmovl_s8(vb01234567c1);
+
+      vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+      vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+      vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
+      vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
+      vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa2), 1);
+      vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa2), 1);
+      const int8x8_t vb89ABCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vxb89ABCDEFc1 = vmovl_s8(vb89ABCDEFc1);
+
+      vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1);
+      vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1);
+      vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1);
+      vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1);
+      vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1);
+      vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1);
+      const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vxb01234567c2 = vmovl_s8(vb01234567c2);
+
+      vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+      vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+      vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
+      vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
+      vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa2), 2);
+      vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa2), 2);
+      const int8x8_t vb89ABCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vxb89ABCDEFc2 = vmovl_s8(vb89ABCDEFc2);
+
+      vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2);
+      vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2);
+      vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa1), 2);
+      vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa1), 2);
+      vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa2), 2);
+      vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa2), 2);
+      const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vxb01234567c3 = vmovl_s8(vb01234567c3);
+
+      vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+      vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+      vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
+      vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
+      vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa2), 3);
+      vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa2), 3);
+      const int8x8_t vb89ABCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vxb89ABCDEFc3 = vmovl_s8(vb89ABCDEFc3);
+
+      vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3);
+      vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3);
+      vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa1), 3);
+      vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa1), 3);
+      vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa2), 3);
+      vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa2), 3);
+
+      const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vxb01234567c4 = vmovl_s8(vb01234567c4);
+
+      vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+      vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+      vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
+      vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
+      vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa2), 0);
+      vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa2), 0);
+      const int8x8_t vb89ABCDEFc4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vxb89ABCDEFc4 = vmovl_s8(vb89ABCDEFc4);
+
+      vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0);
+      vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0);
+      vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa1), 0);
+      vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa1), 0);
+      vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa2), 0);
+      vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa2), 0);
+      const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vxb01234567c5 = vmovl_s8(vb01234567c5);
+
+      vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+      vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+      vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
+      vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
+      vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa2), 1);
+      vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa2), 1);
+      const int8x8_t vb89ABCDEFc5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vxb89ABCDEFc5 = vmovl_s8(vb89ABCDEFc5);
+
+      vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1);
+      vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1);
+      vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa1), 1);
+      vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa1), 1);
+      vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa2), 1);
+      vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa2), 1);
+      const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vxb01234567c6 = vmovl_s8(vb01234567c6);
+
+      vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+      vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+      vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
+      vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
+      vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa2), 2);
+      vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa2), 2);
+      const int8x8_t vb89ABCDEFc6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vxb89ABCDEFc6 = vmovl_s8(vb89ABCDEFc6);
+
+      vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2);
+      vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2);
+      vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa1), 2);
+      vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa1), 2);
+      vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa2), 2);
+      vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa2), 2);
+      const int8x8_t vb01234567c7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vxb01234567c7 = vmovl_s8(vb01234567c7);
+
+      vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa0), 3);
+      vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa0), 3);
+      vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa1), 3);
+      vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa1), 3);
+      vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa2), 3);
+      vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa2), 3);
+      const int8x8_t vb89ABCDEFc7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vxb89ABCDEFc7 = vmovl_s8(vb89ABCDEFc7);
+
+      vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa0), 3);
+      vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa0), 3);
+      vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa1), 3);
+      vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa1), 3);
+      vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa2), 3);
+      vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa2), 3);
+
+      k -= 8 * sizeof(int8_t);
+    }
+    if XNN_UNLIKELY(k != 0) {
+      const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+      const int16x8_t vxa0 = vmovl_s8(va0);
+      const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
+      const int16x8_t vxa1 = vmovl_s8(va1);
+      const int8x8_t va2 = vld1_s8(a2); a2 = (const int8_t*) ((uintptr_t) a2 + k);
+      const int16x8_t vxa2 = vmovl_s8(va2);
+
+      const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vxb01234567c0 = vmovl_s8(vb01234567c0);
+      const int8x8_t vb89ABCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vxb89ABCDEFc0 = vmovl_s8(vb89ABCDEFc0);
+
+      vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+      vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+      vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
+      vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
+      vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
+      vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
+      vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
+      vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
+      vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
+      vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
+      vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
+      vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
+
+      if (k >= 2 * sizeof(int8_t)) {
+        const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb01234567c1 = vmovl_s8(vb01234567c1);
+        const int8x8_t vb89ABCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb89ABCDEFc1 = vmovl_s8(vb89ABCDEFc1);
+
+        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+        vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1);
+        vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1);
+        vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
+        vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
+        vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1);
+        vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1);
+        vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa2), 1);
+        vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa2), 1);
+        vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1);
+        vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1);
+
+        if (k > 2 * sizeof(int8_t)) {
+          const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int16x8_t vxb01234567c2 = vmovl_s8(vb01234567c2);
+          const int8x8_t vb89ABCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int16x8_t vxb89ABCDEFc2 = vmovl_s8(vb89ABCDEFc2);
+
+          vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+          vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+          vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2);
+          vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2);
+          vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
+          vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
+          vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa1), 2);
+          vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa1), 2);
+          vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa2), 2);
+          vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa2), 2);
+          vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa2), 2);
+          vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa2), 2);
+
+          if (k >= 4 * sizeof(int8_t)) {
+            const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int16x8_t vxb01234567c3 = vmovl_s8(vb01234567c3);
+            const int8x8_t vb89ABCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int16x8_t vxb89ABCDEFc3 = vmovl_s8(vb89ABCDEFc3);
+
+            vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+            vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+            vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3);
+            vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3);
+            vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
+            vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
+            vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa1), 3);
+            vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa1), 3);
+            vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa2), 3);
+            vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa2), 3);
+            vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa2), 3);
+            vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa2), 3);
+
+            if (k > 4 * sizeof(int8_t)) {
+              const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+              const int16x8_t vxb01234567c4 = vmovl_s8(vb01234567c4);
+              const int8x8_t vb89ABCDEFc4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+              const int16x8_t vxb89ABCDEFc4 = vmovl_s8(vb89ABCDEFc4);
+
+              vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+              vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+              vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0);
+              vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0);
+              vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
+              vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
+              vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa1), 0);
+              vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa1), 0);
+              vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa2), 0);
+              vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa2), 0);
+              vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa2), 0);
+              vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa2), 0);
+
+              if (k >= 6 * sizeof(int8_t)) {
+                const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+                const int16x8_t vxb01234567c5 = vmovl_s8(vb01234567c5);
+                const int8x8_t vb89ABCDEFc5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+                const int16x8_t vxb89ABCDEFc5 = vmovl_s8(vb89ABCDEFc5);
+
+                vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+                vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+                vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1);
+                vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1);
+                vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
+                vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
+                vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa1), 1);
+                vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa1), 1);
+                vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa2), 1);
+                vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa2), 1);
+                vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa2), 1);
+                vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa2), 1);
+
+                if (k > 6 * sizeof(int8_t)) {
+                  const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+                  const int16x8_t vxb01234567c6 = vmovl_s8(vb01234567c6);
+                  const int8x8_t vb89ABCDEFc6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+                  const int16x8_t vxb89ABCDEFc6 = vmovl_s8(vb89ABCDEFc6);
+
+                  vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+                  vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+                  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2);
+                  vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2);
+                  vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
+                  vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
+                  vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa1), 2);
+                  vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa1), 2);
+                  vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa2), 2);
+                  vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa2), 2);
+                  vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa2), 2);
+                  vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa2), 2);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+    vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc1x89AB = vqrdmulhq_s32(vacc1x89AB, vmultiplier);
+    vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier);
+    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+    vacc2x89AB = vqrdmulhq_s32(vacc2x89AB, vmultiplier);
+    vacc2xCDEF = vqrdmulhq_s32(vacc2xCDEF, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+    vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc1x89AB = vsraq_n_s32(vacc1x89AB, vbicq_s32(vacc1x89AB, vzero_shift_mask), 31);
+    vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31);
+    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+    vacc2x89AB = vsraq_n_s32(vacc2x89AB, vbicq_s32(vacc2x89AB, vzero_shift_mask), 31);
+    vacc2xCDEF = vsraq_n_s32(vacc2xCDEF, vbicq_s32(vacc2xCDEF, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc1x89AB = vrshlq_s32(vacc1x89AB, vright_shift);
+    vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_shift);
+    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+    vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_shift);
+    vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+    const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+    int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
+    int8x16_t vout2x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc2x89ABCDEF);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+    const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+    int8x16_t vout1x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc1x01234567), vqmovn_s16(vacc1x89ABCDEF));
+    int8x16_t vout2x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc2x89ABCDEF));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+    vout1x0123456789ABCDEF = vmaxq_s8(vout1x0123456789ABCDEF, voutput_min);
+    vout2x0123456789ABCDEF = vmaxq_s8(vout2x0123456789ABCDEF, voutput_min);
+
+    vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+    vout1x0123456789ABCDEF = vminq_s8(vout1x0123456789ABCDEF, voutput_max);
+    vout2x0123456789ABCDEF = vminq_s8(vout2x0123456789ABCDEF, voutput_max);
+
+    if (nc >= 16) {
+      vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+      vst1q_s8(c1 + 0, vout1x0123456789ABCDEF);
+      vst1q_s8(c2 + 0, vout2x0123456789ABCDEF);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
+      nc -= 16;
+    } else {
+      int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8(vout1x0123456789ABCDEF));
+      int8x8_t vout2x01234567 = vget_low_s8(vout2x0123456789ABCDEF);
+      if (nc & 8) {
+        vst1_s8(c0, vget_low_s8(vout0x01234567_1x01234567)); c0 += 8;
+        vst1_s8(c1, vget_high_s8(vout0x01234567_1x01234567)); c1 += 8;
+        vst1_s8(c2, vout2x01234567); c2 += 8;
+        vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0123456789ABCDEF));
+        vout2x01234567 = vget_high_s8(vout2x0123456789ABCDEF);
+      }
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1_lane_s8(c2, vout2x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
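
The new 3x16 mlal-lane microkernel above widens the int8 inputs to int16 with vmovl_s8, accumulates with vmlal_lane_s16 into int32 lanes, and then requantizes each lane with a fixed-point multiplier and a rounding shift before narrowing back to int8: vqrdmulhq_s32 by params->neon.multiplier, a sign fix-up built from vsraq_n_s32/vbicq_s32, vrshlq_s32 by params->neon.right_shift, narrowing to int16 with the output zero point added, narrowing again to int8, and a final clamp to [output_min, output_max]. A scalar sketch of that per-lane arithmetic, assuming the shift parameter encodes a right shift by shift >= 0 (the helper name is illustrative and this is a reference model, not the kernel's implementation):

    #include <stdint.h>

    // Per-lane model of the requantization above. Assumes signed right shift
    // is arithmetic, as it is on the targets this kernel runs on.
    static int8_t requantize_lane_sketch(int32_t acc, int32_t multiplier,
                                         uint32_t shift, int16_t zero_point,
                                         int8_t qmin, int8_t qmax) {
      // Saturating rounding doubling high multiply, as in vqrdmulhq_s32.
      int32_t prod;
      if (acc == INT32_MIN && multiplier == INT32_MIN) {
        prod = INT32_MAX;  // the only saturating case
      } else {
        prod = (int32_t) (((int64_t) acc * multiplier + (INT64_C(1) << 30)) >> 31);
      }
      // Pre-adjust negative values when a shift is applied, matching the
      // vsraq_n_s32(acc, vbicq_s32(acc, zero_shift_mask), 31) fix-up.
      if (shift != 0 && prod < 0) {
        prod -= 1;
      }
      // Rounding arithmetic right shift, as vrshlq_s32 performs when the
      // supplied shift amount encodes a right shift.
      const int32_t scaled = (shift == 0)
          ? prod
          : (int32_t) (((int64_t) prod + (INT64_C(1) << (shift - 1))) >> shift);
      // Add the output zero point, then clamp to the requested int8 range.
      int64_t out = (int64_t) scaled + (int64_t) zero_point;
      if (out < (int64_t) qmin) out = qmin;
      if (out > (int64_t) qmax) out = qmax;
      return (int8_t) out;
    }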
diff --git a/src/qs8-gemm/gen/3x16-minmax-neon-mull-addw-dup.c b/src/qs8-gemm/gen/3x16-minmax-neon-mull-addw-dup.c
new file mode 100644
index 0000000..84b77b5
--- /dev/null
+++ b/src/qs8-gemm/gen/3x16-minmax-neon-mull-addw-dup.c
@@ -0,0 +1,548 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/neon-mull-addw-dup.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/gemm.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+
+  do {
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc1x0123 = vacc0x0123;
+    int32x4_t vacc1x4567 = vacc0x4567;
+    int32x4_t vacc1x89AB = vacc0x89AB;
+    int32x4_t vacc1xCDEF = vacc0xCDEF;
+    int32x4_t vacc2x0123 = vacc0x0123;
+    int32x4_t vacc2x4567 = vacc0x4567;
+    int32x4_t vacc2x89AB = vacc0x89AB;
+    int32x4_t vacc2xCDEF = vacc0xCDEF;
+
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
+      const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+      const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+      const int8x8_t va2 = vld1_s8(a2); a2 += 8;
+
+      const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va0, 0));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c0));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c0));
+      const int16x8_t vprod1x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va1, 0));
+      vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c0));
+      vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c0));
+      const int16x8_t vprod2x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va2, 0));
+      vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c0));
+      vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c0));
+      const int8x8_t vb89ABCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va0, 0));
+      vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc0));
+      vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc0));
+      const int16x8_t vprod1x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va1, 0));
+      vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc0));
+      vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc0));
+      const int16x8_t vprod2x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va2, 0));
+      vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc0));
+      vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc0));
+      const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va0, 1));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c1));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c1));
+      const int16x8_t vprod1x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va1, 1));
+      vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c1));
+      vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c1));
+      const int16x8_t vprod2x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va2, 1));
+      vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c1));
+      vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c1));
+      const int8x8_t vb89ABCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va0, 1));
+      vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc1));
+      vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc1));
+      const int16x8_t vprod1x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va1, 1));
+      vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc1));
+      vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc1));
+      const int16x8_t vprod2x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va2, 1));
+      vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc1));
+      vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc1));
+      const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va0, 2));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c2));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c2));
+      const int16x8_t vprod1x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va1, 2));
+      vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c2));
+      vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c2));
+      const int16x8_t vprod2x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va2, 2));
+      vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c2));
+      vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c2));
+      const int8x8_t vb89ABCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va0, 2));
+      vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc2));
+      vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc2));
+      const int16x8_t vprod1x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va1, 2));
+      vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc2));
+      vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc2));
+      const int16x8_t vprod2x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va2, 2));
+      vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc2));
+      vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc2));
+      const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));
+      const int16x8_t vprod1x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va1, 3));
+      vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c3));
+      vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c3));
+      const int16x8_t vprod2x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va2, 3));
+      vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c3));
+      vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c3));
+      const int8x8_t vb89ABCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va0, 3));
+      vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc3));
+      vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc3));
+      const int16x8_t vprod1x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va1, 3));
+      vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc3));
+      vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc3));
+      const int16x8_t vprod2x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va2, 3));
+      vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc3));
+      vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc3));
+      const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va0, 4));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c4));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c4));
+      const int16x8_t vprod1x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va1, 4));
+      vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c4));
+      vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c4));
+      const int16x8_t vprod2x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va2, 4));
+      vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c4));
+      vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c4));
+      const int8x8_t vb89ABCDEFc4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va0, 4));
+      vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc4));
+      vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc4));
+      const int16x8_t vprod1x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va1, 4));
+      vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc4));
+      vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc4));
+      const int16x8_t vprod2x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va2, 4));
+      vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc4));
+      vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc4));
+      const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va0, 5));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c5));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c5));
+      const int16x8_t vprod1x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va1, 5));
+      vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c5));
+      vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c5));
+      const int16x8_t vprod2x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va2, 5));
+      vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c5));
+      vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c5));
+      const int8x8_t vb89ABCDEFc5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va0, 5));
+      vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc5));
+      vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc5));
+      const int16x8_t vprod1x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va1, 5));
+      vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc5));
+      vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc5));
+      const int16x8_t vprod2x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va2, 5));
+      vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc5));
+      vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc5));
+      const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va0, 6));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c6));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c6));
+      const int16x8_t vprod1x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va1, 6));
+      vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c6));
+      vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c6));
+      const int16x8_t vprod2x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va2, 6));
+      vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c6));
+      vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c6));
+      const int8x8_t vb89ABCDEFc6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va0, 6));
+      vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc6));
+      vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc6));
+      const int16x8_t vprod1x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va1, 6));
+      vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc6));
+      vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc6));
+      const int16x8_t vprod2x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va2, 6));
+      vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc6));
+      vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc6));
+      const int8x8_t vb01234567c7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c7 = vmull_s8(vb01234567c7, vdup_lane_s8(va0, 7));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c7));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c7));
+      const int16x8_t vprod1x01234567c7 = vmull_s8(vb01234567c7, vdup_lane_s8(va1, 7));
+      vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c7));
+      vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c7));
+      const int16x8_t vprod2x01234567c7 = vmull_s8(vb01234567c7, vdup_lane_s8(va2, 7));
+      vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c7));
+      vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c7));
+      const int8x8_t vb89ABCDEFc7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x89ABCDEFc7 = vmull_s8(vb89ABCDEFc7, vdup_lane_s8(va0, 7));
+      vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc7));
+      vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc7));
+      const int16x8_t vprod1x89ABCDEFc7 = vmull_s8(vb89ABCDEFc7, vdup_lane_s8(va1, 7));
+      vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc7));
+      vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc7));
+      const int16x8_t vprod2x89ABCDEFc7 = vmull_s8(vb89ABCDEFc7, vdup_lane_s8(va2, 7));
+      vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc7));
+      vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc7));
+
+      k -= 8 * sizeof(int8_t);
+    }
+    if XNN_UNLIKELY(k != 0) {
+      const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+      const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
+      const int8x8_t va2 = vld1_s8(a2); a2 = (const int8_t*) ((uintptr_t) a2 + k);
+
+      const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va0, 0));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c0));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c0));
+      const int16x8_t vprod0x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va0, 0));
+      vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc0));
+      vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc0));
+      const int16x8_t vprod1x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va1, 0));
+      vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c0));
+      vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c0));
+      const int16x8_t vprod1x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va1, 0));
+      vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc0));
+      vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc0));
+      const int16x8_t vprod2x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va2, 0));
+      vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c0));
+      vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c0));
+      const int16x8_t vprod2x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va2, 0));
+      vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc0));
+      vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc0));
+
+      if (k >= 2 * sizeof(int8_t)) {
+        const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va0, 1));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c1));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c1));
+        const int16x8_t vprod0x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va0, 1));
+        vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc1));
+        vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc1));
+        const int16x8_t vprod1x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va1, 1));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c1));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c1));
+        const int16x8_t vprod1x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va1, 1));
+        vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc1));
+        vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc1));
+        const int16x8_t vprod2x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va2, 1));
+        vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c1));
+        vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c1));
+        const int16x8_t vprod2x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va2, 1));
+        vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc1));
+        vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc1));
+
+        if (k > 2 * sizeof(int8_t)) {
+          const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb89ABCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          const int16x8_t vprod0x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va0, 2));
+          vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c2));
+          vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c2));
+          const int16x8_t vprod0x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va0, 2));
+          vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc2));
+          vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc2));
+          const int16x8_t vprod1x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va1, 2));
+          vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c2));
+          vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c2));
+          const int16x8_t vprod1x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va1, 2));
+          vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc2));
+          vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc2));
+          const int16x8_t vprod2x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va2, 2));
+          vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c2));
+          vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c2));
+          const int16x8_t vprod2x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va2, 2));
+          vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc2));
+          vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc2));
+
+          if (k >= 4 * sizeof(int8_t)) {
+            const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int8x8_t vb89ABCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+            const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));
+            vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
+            vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));
+            const int16x8_t vprod0x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va0, 3));
+            vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc3));
+            vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc3));
+            const int16x8_t vprod1x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va1, 3));
+            vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c3));
+            vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c3));
+            const int16x8_t vprod1x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va1, 3));
+            vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc3));
+            vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc3));
+            const int16x8_t vprod2x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va2, 3));
+            vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c3));
+            vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c3));
+            const int16x8_t vprod2x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va2, 3));
+            vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc3));
+            vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc3));
+
+            if (k > 4 * sizeof(int8_t)) {
+              const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+              const int8x8_t vb89ABCDEFc4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+              const int16x8_t vprod0x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va0, 4));
+              vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c4));
+              vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c4));
+              const int16x8_t vprod0x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va0, 4));
+              vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc4));
+              vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc4));
+              const int16x8_t vprod1x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va1, 4));
+              vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c4));
+              vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c4));
+              const int16x8_t vprod1x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va1, 4));
+              vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc4));
+              vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc4));
+              const int16x8_t vprod2x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va2, 4));
+              vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c4));
+              vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c4));
+              const int16x8_t vprod2x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va2, 4));
+              vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc4));
+              vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc4));
+
+              if (k >= 6 * sizeof(int8_t)) {
+                const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+                const int8x8_t vb89ABCDEFc5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+                const int16x8_t vprod0x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va0, 5));
+                vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c5));
+                vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c5));
+                const int16x8_t vprod0x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va0, 5));
+                vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc5));
+                vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc5));
+                const int16x8_t vprod1x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va1, 5));
+                vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c5));
+                vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c5));
+                const int16x8_t vprod1x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va1, 5));
+                vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc5));
+                vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc5));
+                const int16x8_t vprod2x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va2, 5));
+                vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c5));
+                vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c5));
+                const int16x8_t vprod2x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va2, 5));
+                vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc5));
+                vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc5));
+
+                if (k > 6 * sizeof(int8_t)) {
+                  const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+                  const int8x8_t vb89ABCDEFc6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+                  const int16x8_t vprod0x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va0, 6));
+                  vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c6));
+                  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c6));
+                  const int16x8_t vprod0x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va0, 6));
+                  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc6));
+                  vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc6));
+                  const int16x8_t vprod1x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va1, 6));
+                  vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c6));
+                  vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c6));
+                  const int16x8_t vprod1x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va1, 6));
+                  vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc6));
+                  vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc6));
+                  const int16x8_t vprod2x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va2, 6));
+                  vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c6));
+                  vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c6));
+                  const int16x8_t vprod2x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va2, 6));
+                  vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc6));
+                  vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc6));
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+    vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc1x89AB = vqrdmulhq_s32(vacc1x89AB, vmultiplier);
+    vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier);
+    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+    vacc2x89AB = vqrdmulhq_s32(vacc2x89AB, vmultiplier);
+    vacc2xCDEF = vqrdmulhq_s32(vacc2xCDEF, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+    vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc1x89AB = vsraq_n_s32(vacc1x89AB, vbicq_s32(vacc1x89AB, vzero_shift_mask), 31);
+    vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31);
+    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+    vacc2x89AB = vsraq_n_s32(vacc2x89AB, vbicq_s32(vacc2x89AB, vzero_shift_mask), 31);
+    vacc2xCDEF = vsraq_n_s32(vacc2xCDEF, vbicq_s32(vacc2xCDEF, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc1x89AB = vrshlq_s32(vacc1x89AB, vright_shift);
+    vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_shift);
+    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+    vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_shift);
+    vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+    const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+    int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
+    int8x16_t vout2x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc2x89ABCDEF);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+    const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+    int8x16_t vout1x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc1x01234567), vqmovn_s16(vacc1x89ABCDEF));
+    int8x16_t vout2x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc2x89ABCDEF));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+    vout1x0123456789ABCDEF = vmaxq_s8(vout1x0123456789ABCDEF, voutput_min);
+    vout2x0123456789ABCDEF = vmaxq_s8(vout2x0123456789ABCDEF, voutput_min);
+
+    vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+    vout1x0123456789ABCDEF = vminq_s8(vout1x0123456789ABCDEF, voutput_max);
+    vout2x0123456789ABCDEF = vminq_s8(vout2x0123456789ABCDEF, voutput_max);
+
+    if (nc >= 16) {
+      vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+      vst1q_s8(c1 + 0, vout1x0123456789ABCDEF);
+      vst1q_s8(c2 + 0, vout2x0123456789ABCDEF);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
+      nc -= 16;
+    } else {
+      int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8(vout1x0123456789ABCDEF));
+      int8x8_t vout2x01234567 = vget_low_s8(vout2x0123456789ABCDEF);
+      if (nc & 8) {
+        vst1_s8(c0, vget_low_s8(vout0x01234567_1x01234567)); c0 += 8;
+        vst1_s8(c1, vget_high_s8(vout0x01234567_1x01234567)); c1 += 8;
+        vst1_s8(c2, vout2x01234567); c2 += 8;
+        vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0123456789ABCDEF));
+        vout2x01234567 = vget_high_s8(vout2x0123456789ABCDEF);
+      }
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1_lane_s8(c2, vout2x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-gemm/gen/3x16c16-minmax-neon-mlal-padal.c b/src/qs8-gemm/gen/3x16c16-minmax-neon-mlal-padal.c
new file mode 100644
index 0000000..8dd6360
--- /dev/null
+++ b/src/qs8-gemm/gen/3x16c16-minmax-neon-mlal-padal.c
@@ -0,0 +1,524 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/c16-neon-mlal-padal.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 16);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+
+  do {
+    int32x4_t vacc0x0 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x1 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x2 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x3 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x4 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x5 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x6 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x7 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x8 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x9 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x10 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x11 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x12 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x13 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x14 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x15 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc1x0 = vacc0x0;
+    int32x4_t vacc1x1 = vacc0x1;
+    int32x4_t vacc1x2 = vacc0x2;
+    int32x4_t vacc1x3 = vacc0x3;
+    int32x4_t vacc1x4 = vacc0x4;
+    int32x4_t vacc1x5 = vacc0x5;
+    int32x4_t vacc1x6 = vacc0x6;
+    int32x4_t vacc1x7 = vacc0x7;
+    int32x4_t vacc1x8 = vacc0x8;
+    int32x4_t vacc1x9 = vacc0x9;
+    int32x4_t vacc1x10 = vacc0x10;
+    int32x4_t vacc1x11 = vacc0x11;
+    int32x4_t vacc1x12 = vacc0x12;
+    int32x4_t vacc1x13 = vacc0x13;
+    int32x4_t vacc1x14 = vacc0x14;
+    int32x4_t vacc1x15 = vacc0x15;
+    int32x4_t vacc2x0 = vacc0x0;
+    int32x4_t vacc2x1 = vacc0x1;
+    int32x4_t vacc2x2 = vacc0x2;
+    int32x4_t vacc2x3 = vacc0x3;
+    int32x4_t vacc2x4 = vacc0x4;
+    int32x4_t vacc2x5 = vacc0x5;
+    int32x4_t vacc2x6 = vacc0x6;
+    int32x4_t vacc2x7 = vacc0x7;
+    int32x4_t vacc2x8 = vacc0x8;
+    int32x4_t vacc2x9 = vacc0x9;
+    int32x4_t vacc2x10 = vacc0x10;
+    int32x4_t vacc2x11 = vacc0x11;
+    int32x4_t vacc2x12 = vacc0x12;
+    int32x4_t vacc2x13 = vacc0x13;
+    int32x4_t vacc2x14 = vacc0x14;
+    int32x4_t vacc2x15 = vacc0x15;
+
+    // KC loop: process K in blocks of 16 bytes
+    size_t k = 0;
+    while (k < kc) {
+      const int8x16_t va0 = vld1q_s8(a0); a0 += 16;
+      const int8x16_t va1 = vld1q_s8(a1); a1 += 16;
+      const int8x16_t va2 = vld1q_s8(a2); a2 += 16;
+
+      const int8x16_t vb0 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb1 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb2 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb3 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb4 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb5 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb6 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb7 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb8 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb9 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb10 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb11 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb12 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb13 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb14 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb15 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+
+      int16x8_t vprod0x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va0));
+      int16x8_t vprod1x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va1));
+      int16x8_t vprod2x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va2));
+      vprod0x0 = vmlal_s8(vprod0x0, vget_high_s8(vb0), vget_high_s8(va0));
+      vprod1x0 = vmlal_s8(vprod1x0, vget_high_s8(vb0), vget_high_s8(va1));
+      vprod2x0 = vmlal_s8(vprod2x0, vget_high_s8(vb0), vget_high_s8(va2));
+      vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+      vacc1x0 = vpadalq_s16(vacc1x0, vprod1x0);
+      vacc2x0 = vpadalq_s16(vacc2x0, vprod2x0);
+      int16x8_t vprod0x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va0));
+      int16x8_t vprod1x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va1));
+      int16x8_t vprod2x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va2));
+      vprod0x1 = vmlal_s8(vprod0x1, vget_high_s8(vb1), vget_high_s8(va0));
+      vprod1x1 = vmlal_s8(vprod1x1, vget_high_s8(vb1), vget_high_s8(va1));
+      vprod2x1 = vmlal_s8(vprod2x1, vget_high_s8(vb1), vget_high_s8(va2));
+      vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+      vacc1x1 = vpadalq_s16(vacc1x1, vprod1x1);
+      vacc2x1 = vpadalq_s16(vacc2x1, vprod2x1);
+      int16x8_t vprod0x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va0));
+      int16x8_t vprod1x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va1));
+      int16x8_t vprod2x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va2));
+      vprod0x2 = vmlal_s8(vprod0x2, vget_high_s8(vb2), vget_high_s8(va0));
+      vprod1x2 = vmlal_s8(vprod1x2, vget_high_s8(vb2), vget_high_s8(va1));
+      vprod2x2 = vmlal_s8(vprod2x2, vget_high_s8(vb2), vget_high_s8(va2));
+      vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+      vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);
+      vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
+      int16x8_t vprod0x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va0));
+      int16x8_t vprod1x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va1));
+      int16x8_t vprod2x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va2));
+      vprod0x3 = vmlal_s8(vprod0x3, vget_high_s8(vb3), vget_high_s8(va0));
+      vprod1x3 = vmlal_s8(vprod1x3, vget_high_s8(vb3), vget_high_s8(va1));
+      vprod2x3 = vmlal_s8(vprod2x3, vget_high_s8(vb3), vget_high_s8(va2));
+      vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+      vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);
+      vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
+      int16x8_t vprod0x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va0));
+      int16x8_t vprod1x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va1));
+      int16x8_t vprod2x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va2));
+      vprod0x4 = vmlal_s8(vprod0x4, vget_high_s8(vb4), vget_high_s8(va0));
+      vprod1x4 = vmlal_s8(vprod1x4, vget_high_s8(vb4), vget_high_s8(va1));
+      vprod2x4 = vmlal_s8(vprod2x4, vget_high_s8(vb4), vget_high_s8(va2));
+      vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+      vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);
+      vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);
+      int16x8_t vprod0x5 = vmull_s8(vget_low_s8(vb5), vget_low_s8(va0));
+      int16x8_t vprod1x5 = vmull_s8(vget_low_s8(vb5), vget_low_s8(va1));
+      int16x8_t vprod2x5 = vmull_s8(vget_low_s8(vb5), vget_low_s8(va2));
+      vprod0x5 = vmlal_s8(vprod0x5, vget_high_s8(vb5), vget_high_s8(va0));
+      vprod1x5 = vmlal_s8(vprod1x5, vget_high_s8(vb5), vget_high_s8(va1));
+      vprod2x5 = vmlal_s8(vprod2x5, vget_high_s8(vb5), vget_high_s8(va2));
+      vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+      vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);
+      vacc2x5 = vpadalq_s16(vacc2x5, vprod2x5);
+      int16x8_t vprod0x6 = vmull_s8(vget_low_s8(vb6), vget_low_s8(va0));
+      int16x8_t vprod1x6 = vmull_s8(vget_low_s8(vb6), vget_low_s8(va1));
+      int16x8_t vprod2x6 = vmull_s8(vget_low_s8(vb6), vget_low_s8(va2));
+      vprod0x6 = vmlal_s8(vprod0x6, vget_high_s8(vb6), vget_high_s8(va0));
+      vprod1x6 = vmlal_s8(vprod1x6, vget_high_s8(vb6), vget_high_s8(va1));
+      vprod2x6 = vmlal_s8(vprod2x6, vget_high_s8(vb6), vget_high_s8(va2));
+      vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+      vacc1x6 = vpadalq_s16(vacc1x6, vprod1x6);
+      vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6);
+      int16x8_t vprod0x7 = vmull_s8(vget_low_s8(vb7), vget_low_s8(va0));
+      int16x8_t vprod1x7 = vmull_s8(vget_low_s8(vb7), vget_low_s8(va1));
+      int16x8_t vprod2x7 = vmull_s8(vget_low_s8(vb7), vget_low_s8(va2));
+      vprod0x7 = vmlal_s8(vprod0x7, vget_high_s8(vb7), vget_high_s8(va0));
+      vprod1x7 = vmlal_s8(vprod1x7, vget_high_s8(vb7), vget_high_s8(va1));
+      vprod2x7 = vmlal_s8(vprod2x7, vget_high_s8(vb7), vget_high_s8(va2));
+      vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+      vacc1x7 = vpadalq_s16(vacc1x7, vprod1x7);
+      vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7);
+      int16x8_t vprod0x8 = vmull_s8(vget_low_s8(vb8), vget_low_s8(va0));
+      int16x8_t vprod1x8 = vmull_s8(vget_low_s8(vb8), vget_low_s8(va1));
+      int16x8_t vprod2x8 = vmull_s8(vget_low_s8(vb8), vget_low_s8(va2));
+      vprod0x8 = vmlal_s8(vprod0x8, vget_high_s8(vb8), vget_high_s8(va0));
+      vprod1x8 = vmlal_s8(vprod1x8, vget_high_s8(vb8), vget_high_s8(va1));
+      vprod2x8 = vmlal_s8(vprod2x8, vget_high_s8(vb8), vget_high_s8(va2));
+      vacc0x8 = vpadalq_s16(vacc0x8, vprod0x8);
+      vacc1x8 = vpadalq_s16(vacc1x8, vprod1x8);
+      vacc2x8 = vpadalq_s16(vacc2x8, vprod2x8);
+      int16x8_t vprod0x9 = vmull_s8(vget_low_s8(vb9), vget_low_s8(va0));
+      int16x8_t vprod1x9 = vmull_s8(vget_low_s8(vb9), vget_low_s8(va1));
+      int16x8_t vprod2x9 = vmull_s8(vget_low_s8(vb9), vget_low_s8(va2));
+      vprod0x9 = vmlal_s8(vprod0x9, vget_high_s8(vb9), vget_high_s8(va0));
+      vprod1x9 = vmlal_s8(vprod1x9, vget_high_s8(vb9), vget_high_s8(va1));
+      vprod2x9 = vmlal_s8(vprod2x9, vget_high_s8(vb9), vget_high_s8(va2));
+      vacc0x9 = vpadalq_s16(vacc0x9, vprod0x9);
+      vacc1x9 = vpadalq_s16(vacc1x9, vprod1x9);
+      vacc2x9 = vpadalq_s16(vacc2x9, vprod2x9);
+      int16x8_t vprod0x10 = vmull_s8(vget_low_s8(vb10), vget_low_s8(va0));
+      int16x8_t vprod1x10 = vmull_s8(vget_low_s8(vb10), vget_low_s8(va1));
+      int16x8_t vprod2x10 = vmull_s8(vget_low_s8(vb10), vget_low_s8(va2));
+      vprod0x10 = vmlal_s8(vprod0x10, vget_high_s8(vb10), vget_high_s8(va0));
+      vprod1x10 = vmlal_s8(vprod1x10, vget_high_s8(vb10), vget_high_s8(va1));
+      vprod2x10 = vmlal_s8(vprod2x10, vget_high_s8(vb10), vget_high_s8(va2));
+      vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
+      vacc1x10 = vpadalq_s16(vacc1x10, vprod1x10);
+      vacc2x10 = vpadalq_s16(vacc2x10, vprod2x10);
+      int16x8_t vprod0x11 = vmull_s8(vget_low_s8(vb11), vget_low_s8(va0));
+      int16x8_t vprod1x11 = vmull_s8(vget_low_s8(vb11), vget_low_s8(va1));
+      int16x8_t vprod2x11 = vmull_s8(vget_low_s8(vb11), vget_low_s8(va2));
+      vprod0x11 = vmlal_s8(vprod0x11, vget_high_s8(vb11), vget_high_s8(va0));
+      vprod1x11 = vmlal_s8(vprod1x11, vget_high_s8(vb11), vget_high_s8(va1));
+      vprod2x11 = vmlal_s8(vprod2x11, vget_high_s8(vb11), vget_high_s8(va2));
+      vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11);
+      vacc1x11 = vpadalq_s16(vacc1x11, vprod1x11);
+      vacc2x11 = vpadalq_s16(vacc2x11, vprod2x11);
+      int16x8_t vprod0x12 = vmull_s8(vget_low_s8(vb12), vget_low_s8(va0));
+      int16x8_t vprod1x12 = vmull_s8(vget_low_s8(vb12), vget_low_s8(va1));
+      int16x8_t vprod2x12 = vmull_s8(vget_low_s8(vb12), vget_low_s8(va2));
+      vprod0x12 = vmlal_s8(vprod0x12, vget_high_s8(vb12), vget_high_s8(va0));
+      vprod1x12 = vmlal_s8(vprod1x12, vget_high_s8(vb12), vget_high_s8(va1));
+      vprod2x12 = vmlal_s8(vprod2x12, vget_high_s8(vb12), vget_high_s8(va2));
+      vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);
+      vacc1x12 = vpadalq_s16(vacc1x12, vprod1x12);
+      vacc2x12 = vpadalq_s16(vacc2x12, vprod2x12);
+      int16x8_t vprod0x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va0));
+      int16x8_t vprod1x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va1));
+      int16x8_t vprod2x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va2));
+      vprod0x13 = vmlal_s8(vprod0x13, vget_high_s8(vb13), vget_high_s8(va0));
+      vprod1x13 = vmlal_s8(vprod1x13, vget_high_s8(vb13), vget_high_s8(va1));
+      vprod2x13 = vmlal_s8(vprod2x13, vget_high_s8(vb13), vget_high_s8(va2));
+      vacc0x13 = vpadalq_s16(vacc0x13, vprod0x13);
+      vacc1x13 = vpadalq_s16(vacc1x13, vprod1x13);
+      vacc2x13 = vpadalq_s16(vacc2x13, vprod2x13);
+      int16x8_t vprod0x14 = vmull_s8(vget_low_s8(vb14), vget_low_s8(va0));
+      int16x8_t vprod1x14 = vmull_s8(vget_low_s8(vb14), vget_low_s8(va1));
+      int16x8_t vprod2x14 = vmull_s8(vget_low_s8(vb14), vget_low_s8(va2));
+      vprod0x14 = vmlal_s8(vprod0x14, vget_high_s8(vb14), vget_high_s8(va0));
+      vprod1x14 = vmlal_s8(vprod1x14, vget_high_s8(vb14), vget_high_s8(va1));
+      vprod2x14 = vmlal_s8(vprod2x14, vget_high_s8(vb14), vget_high_s8(va2));
+      vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
+      vacc1x14 = vpadalq_s16(vacc1x14, vprod1x14);
+      vacc2x14 = vpadalq_s16(vacc2x14, vprod2x14);
+      int16x8_t vprod0x15 = vmull_s8(vget_low_s8(vb15), vget_low_s8(va0));
+      int16x8_t vprod1x15 = vmull_s8(vget_low_s8(vb15), vget_low_s8(va1));
+      int16x8_t vprod2x15 = vmull_s8(vget_low_s8(vb15), vget_low_s8(va2));
+      vprod0x15 = vmlal_s8(vprod0x15, vget_high_s8(vb15), vget_high_s8(va0));
+      vprod1x15 = vmlal_s8(vprod1x15, vget_high_s8(vb15), vget_high_s8(va1));
+      vprod2x15 = vmlal_s8(vprod2x15, vget_high_s8(vb15), vget_high_s8(va2));
+      vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
+      vacc1x15 = vpadalq_s16(vacc1x15, vprod1x15);
+      vacc2x15 = vpadalq_s16(vacc2x15, vprod2x15);
+
+      k += 16 * sizeof(int8_t);
+    }
+
+#if XNN_ARCH_ARM64
+    const int32x4_t vsum0x01 = vpaddq_s32(vacc0x0, vacc0x1);
+    const int32x4_t vsum0x23 = vpaddq_s32(vacc0x2, vacc0x3);
+    const int32x4_t vsum0x45 = vpaddq_s32(vacc0x4, vacc0x5);
+    const int32x4_t vsum0x67 = vpaddq_s32(vacc0x6, vacc0x7);
+    const int32x4_t vsum0x89 = vpaddq_s32(vacc0x8, vacc0x9);
+    const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11);
+    const int32x4_t vsum0xCD = vpaddq_s32(vacc0x12, vacc0x13);
+    const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
+    const int32x4_t vsum1x01 = vpaddq_s32(vacc1x0, vacc1x1);
+    const int32x4_t vsum1x23 = vpaddq_s32(vacc1x2, vacc1x3);
+    const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);
+    const int32x4_t vsum1x67 = vpaddq_s32(vacc1x6, vacc1x7);
+    const int32x4_t vsum1x89 = vpaddq_s32(vacc1x8, vacc1x9);
+    const int32x4_t vsum1xAB = vpaddq_s32(vacc1x10, vacc1x11);
+    const int32x4_t vsum1xCD = vpaddq_s32(vacc1x12, vacc1x13);
+    const int32x4_t vsum1xEF = vpaddq_s32(vacc1x14, vacc1x15);
+    const int32x4_t vsum2x01 = vpaddq_s32(vacc2x0, vacc2x1);
+    const int32x4_t vsum2x23 = vpaddq_s32(vacc2x2, vacc2x3);
+    const int32x4_t vsum2x45 = vpaddq_s32(vacc2x4, vacc2x5);
+    const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7);
+    const int32x4_t vsum2x89 = vpaddq_s32(vacc2x8, vacc2x9);
+    const int32x4_t vsum2xAB = vpaddq_s32(vacc2x10, vacc2x11);
+    const int32x4_t vsum2xCD = vpaddq_s32(vacc2x12, vacc2x13);
+    const int32x4_t vsum2xEF = vpaddq_s32(vacc2x14, vacc2x15);
+    int32x4_t vacc0x0123 = vpaddq_s32(vsum0x01, vsum0x23);
+    int32x4_t vacc0x4567 = vpaddq_s32(vsum0x45, vsum0x67);
+    int32x4_t vacc0x89AB = vpaddq_s32(vsum0x89, vsum0xAB);
+    int32x4_t vacc0xCDEF = vpaddq_s32(vsum0xCD, vsum0xEF);
+    int32x4_t vacc1x0123 = vpaddq_s32(vsum1x01, vsum1x23);
+    int32x4_t vacc1x4567 = vpaddq_s32(vsum1x45, vsum1x67);
+    int32x4_t vacc1x89AB = vpaddq_s32(vsum1x89, vsum1xAB);
+    int32x4_t vacc1xCDEF = vpaddq_s32(vsum1xCD, vsum1xEF);
+    int32x4_t vacc2x0123 = vpaddq_s32(vsum2x01, vsum2x23);
+    int32x4_t vacc2x4567 = vpaddq_s32(vsum2x45, vsum2x67);
+    int32x4_t vacc2x89AB = vpaddq_s32(vsum2x89, vsum2xAB);
+    int32x4_t vacc2xCDEF = vpaddq_s32(vsum2xCD, vsum2xEF);
+#else
+    const int32x2_t vpsum0x0 = vadd_s32(vget_low_s32(vacc0x0), vget_high_s32(vacc0x0));
+    const int32x2_t vpsum0x1 = vadd_s32(vget_low_s32(vacc0x1), vget_high_s32(vacc0x1));
+    const int32x2_t vpsum0x2 = vadd_s32(vget_low_s32(vacc0x2), vget_high_s32(vacc0x2));
+    const int32x2_t vpsum0x3 = vadd_s32(vget_low_s32(vacc0x3), vget_high_s32(vacc0x3));
+    const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1);
+    const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3);
+    int32x4_t vacc0x0123 = vcombine_s32(vsum0x01, vsum0x23);
+    const int32x2_t vpsum0x4 = vadd_s32(vget_low_s32(vacc0x4), vget_high_s32(vacc0x4));
+    const int32x2_t vpsum0x5 = vadd_s32(vget_low_s32(vacc0x5), vget_high_s32(vacc0x5));
+    const int32x2_t vpsum0x6 = vadd_s32(vget_low_s32(vacc0x6), vget_high_s32(vacc0x6));
+    const int32x2_t vpsum0x7 = vadd_s32(vget_low_s32(vacc0x7), vget_high_s32(vacc0x7));
+    const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5);
+    const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7);
+    int32x4_t vacc0x4567 = vcombine_s32(vsum0x45, vsum0x67);
+    const int32x2_t vpsum0x8 = vadd_s32(vget_low_s32(vacc0x8), vget_high_s32(vacc0x8));
+    const int32x2_t vpsum0x9 = vadd_s32(vget_low_s32(vacc0x9), vget_high_s32(vacc0x9));
+    const int32x2_t vpsum0xA = vadd_s32(vget_low_s32(vacc0x10), vget_high_s32(vacc0x10));
+    const int32x2_t vpsum0xB = vadd_s32(vget_low_s32(vacc0x11), vget_high_s32(vacc0x11));
+    const int32x2_t vsum0x89 = vpadd_s32(vpsum0x8, vpsum0x9);
+    const int32x2_t vsum0xAB = vpadd_s32(vpsum0xA, vpsum0xB);
+    int32x4_t vacc0x89AB = vcombine_s32(vsum0x89, vsum0xAB);
+    const int32x2_t vpsum0xC = vadd_s32(vget_low_s32(vacc0x12), vget_high_s32(vacc0x12));
+    const int32x2_t vpsum0xD = vadd_s32(vget_low_s32(vacc0x13), vget_high_s32(vacc0x13));
+    const int32x2_t vpsum0xE = vadd_s32(vget_low_s32(vacc0x14), vget_high_s32(vacc0x14));
+    const int32x2_t vpsum0xF = vadd_s32(vget_low_s32(vacc0x15), vget_high_s32(vacc0x15));
+    const int32x2_t vsum0xCD = vpadd_s32(vpsum0xC, vpsum0xD);
+    const int32x2_t vsum0xEF = vpadd_s32(vpsum0xE, vpsum0xF);
+    int32x4_t vacc0xCDEF = vcombine_s32(vsum0xCD, vsum0xEF);
+    const int32x2_t vpsum1x0 = vadd_s32(vget_low_s32(vacc1x0), vget_high_s32(vacc1x0));
+    const int32x2_t vpsum1x1 = vadd_s32(vget_low_s32(vacc1x1), vget_high_s32(vacc1x1));
+    const int32x2_t vpsum1x2 = vadd_s32(vget_low_s32(vacc1x2), vget_high_s32(vacc1x2));
+    const int32x2_t vpsum1x3 = vadd_s32(vget_low_s32(vacc1x3), vget_high_s32(vacc1x3));
+    const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1);
+    const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3);
+    int32x4_t vacc1x0123 = vcombine_s32(vsum1x01, vsum1x23);
+    const int32x2_t vpsum1x4 = vadd_s32(vget_low_s32(vacc1x4), vget_high_s32(vacc1x4));
+    const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));
+    const int32x2_t vpsum1x6 = vadd_s32(vget_low_s32(vacc1x6), vget_high_s32(vacc1x6));
+    const int32x2_t vpsum1x7 = vadd_s32(vget_low_s32(vacc1x7), vget_high_s32(vacc1x7));
+    const int32x2_t vsum1x45 = vpadd_s32(vpsum1x4, vpsum1x5);
+    const int32x2_t vsum1x67 = vpadd_s32(vpsum1x6, vpsum1x7);
+    int32x4_t vacc1x4567 = vcombine_s32(vsum1x45, vsum1x67);
+    const int32x2_t vpsum1x8 = vadd_s32(vget_low_s32(vacc1x8), vget_high_s32(vacc1x8));
+    const int32x2_t vpsum1x9 = vadd_s32(vget_low_s32(vacc1x9), vget_high_s32(vacc1x9));
+    const int32x2_t vpsum1xA = vadd_s32(vget_low_s32(vacc1x10), vget_high_s32(vacc1x10));
+    const int32x2_t vpsum1xB = vadd_s32(vget_low_s32(vacc1x11), vget_high_s32(vacc1x11));
+    const int32x2_t vsum1x89 = vpadd_s32(vpsum1x8, vpsum1x9);
+    const int32x2_t vsum1xAB = vpadd_s32(vpsum1xA, vpsum1xB);
+    int32x4_t vacc1x89AB = vcombine_s32(vsum1x89, vsum1xAB);
+    const int32x2_t vpsum1xC = vadd_s32(vget_low_s32(vacc1x12), vget_high_s32(vacc1x12));
+    const int32x2_t vpsum1xD = vadd_s32(vget_low_s32(vacc1x13), vget_high_s32(vacc1x13));
+    const int32x2_t vpsum1xE = vadd_s32(vget_low_s32(vacc1x14), vget_high_s32(vacc1x14));
+    const int32x2_t vpsum1xF = vadd_s32(vget_low_s32(vacc1x15), vget_high_s32(vacc1x15));
+    const int32x2_t vsum1xCD = vpadd_s32(vpsum1xC, vpsum1xD);
+    const int32x2_t vsum1xEF = vpadd_s32(vpsum1xE, vpsum1xF);
+    int32x4_t vacc1xCDEF = vcombine_s32(vsum1xCD, vsum1xEF);
+    const int32x2_t vpsum2x0 = vadd_s32(vget_low_s32(vacc2x0), vget_high_s32(vacc2x0));
+    const int32x2_t vpsum2x1 = vadd_s32(vget_low_s32(vacc2x1), vget_high_s32(vacc2x1));
+    const int32x2_t vpsum2x2 = vadd_s32(vget_low_s32(vacc2x2), vget_high_s32(vacc2x2));
+    const int32x2_t vpsum2x3 = vadd_s32(vget_low_s32(vacc2x3), vget_high_s32(vacc2x3));
+    const int32x2_t vsum2x01 = vpadd_s32(vpsum2x0, vpsum2x1);
+    const int32x2_t vsum2x23 = vpadd_s32(vpsum2x2, vpsum2x3);
+    int32x4_t vacc2x0123 = vcombine_s32(vsum2x01, vsum2x23);
+    const int32x2_t vpsum2x4 = vadd_s32(vget_low_s32(vacc2x4), vget_high_s32(vacc2x4));
+    const int32x2_t vpsum2x5 = vadd_s32(vget_low_s32(vacc2x5), vget_high_s32(vacc2x5));
+    const int32x2_t vpsum2x6 = vadd_s32(vget_low_s32(vacc2x6), vget_high_s32(vacc2x6));
+    const int32x2_t vpsum2x7 = vadd_s32(vget_low_s32(vacc2x7), vget_high_s32(vacc2x7));
+    const int32x2_t vsum2x45 = vpadd_s32(vpsum2x4, vpsum2x5);
+    const int32x2_t vsum2x67 = vpadd_s32(vpsum2x6, vpsum2x7);
+    int32x4_t vacc2x4567 = vcombine_s32(vsum2x45, vsum2x67);
+    const int32x2_t vpsum2x8 = vadd_s32(vget_low_s32(vacc2x8), vget_high_s32(vacc2x8));
+    const int32x2_t vpsum2x9 = vadd_s32(vget_low_s32(vacc2x9), vget_high_s32(vacc2x9));
+    const int32x2_t vpsum2xA = vadd_s32(vget_low_s32(vacc2x10), vget_high_s32(vacc2x10));
+    const int32x2_t vpsum2xB = vadd_s32(vget_low_s32(vacc2x11), vget_high_s32(vacc2x11));
+    const int32x2_t vsum2x89 = vpadd_s32(vpsum2x8, vpsum2x9);
+    const int32x2_t vsum2xAB = vpadd_s32(vpsum2xA, vpsum2xB);
+    int32x4_t vacc2x89AB = vcombine_s32(vsum2x89, vsum2xAB);
+    const int32x2_t vpsum2xC = vadd_s32(vget_low_s32(vacc2x12), vget_high_s32(vacc2x12));
+    const int32x2_t vpsum2xD = vadd_s32(vget_low_s32(vacc2x13), vget_high_s32(vacc2x13));
+    const int32x2_t vpsum2xE = vadd_s32(vget_low_s32(vacc2x14), vget_high_s32(vacc2x14));
+    const int32x2_t vpsum2xF = vadd_s32(vget_low_s32(vacc2x15), vget_high_s32(vacc2x15));
+    const int32x2_t vsum2xCD = vpadd_s32(vpsum2xC, vpsum2xD);
+    const int32x2_t vsum2xEF = vpadd_s32(vpsum2xE, vpsum2xF);
+    int32x4_t vacc2xCDEF = vcombine_s32(vsum2xCD, vsum2xEF);
+#endif
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+    vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc1x89AB = vqrdmulhq_s32(vacc1x89AB, vmultiplier);
+    vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier);
+    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+    vacc2x89AB = vqrdmulhq_s32(vacc2x89AB, vmultiplier);
+    vacc2xCDEF = vqrdmulhq_s32(vacc2xCDEF, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+    vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc1x89AB = vsraq_n_s32(vacc1x89AB, vbicq_s32(vacc1x89AB, vzero_shift_mask), 31);
+    vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31);
+    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+    vacc2x89AB = vsraq_n_s32(vacc2x89AB, vbicq_s32(vacc2x89AB, vzero_shift_mask), 31);
+    vacc2xCDEF = vsraq_n_s32(vacc2xCDEF, vbicq_s32(vacc2xCDEF, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc1x89AB = vrshlq_s32(vacc1x89AB, vright_shift);
+    vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_shift);
+    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+    vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_shift);
+    vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+    const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), voutput_zero_point);
+    int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+    int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
+    int8x16_t vout2x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc2x89ABCDEF);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+    const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+    int8x16_t vout1x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc1x01234567), vqmovn_s16(vacc1x89ABCDEF));
+    int8x16_t vout2x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc2x89ABCDEF));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+    vout1x0123456789ABCDEF = vmaxq_s8(vout1x0123456789ABCDEF, voutput_min);
+    vout2x0123456789ABCDEF = vmaxq_s8(vout2x0123456789ABCDEF, voutput_min);
+
+    vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+    vout1x0123456789ABCDEF = vminq_s8(vout1x0123456789ABCDEF, voutput_max);
+    vout2x0123456789ABCDEF = vminq_s8(vout2x0123456789ABCDEF, voutput_max);
+
+    if (nc >= 16) {
+      vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+      vst1q_s8(c1 + 0, vout1x0123456789ABCDEF);
+      vst1q_s8(c2 + 0, vout2x0123456789ABCDEF);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
+      nc -= 16;
+    } else {
+      int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8(vout1x0123456789ABCDEF));
+      int8x8_t vout2x01234567 = vget_low_s8(vout2x0123456789ABCDEF);
+      if (nc & 8) {
+        vst1_s8(c0, vget_low_s8(vout0x01234567_1x01234567)); c0 += 8;
+        vst1_s8(c1, vget_high_s8(vout0x01234567_1x01234567)); c1 += 8;
+        vst1_s8(c2, vout2x01234567); c2 += 8;
+        vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0123456789ABCDEF));
+        vout2x01234567 = vget_high_s8(vout2x0123456789ABCDEF);
+      }
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1_lane_s8(c2, vout2x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-gemm/gen/3x16c2-minmax-neon-mlal-padal-dup.c b/src/qs8-gemm/gen/3x16c2-minmax-neon-mlal-padal-dup.c
new file mode 100644
index 0000000..e3d71f4
--- /dev/null
+++ b/src/qs8-gemm/gen/3x16c2-minmax-neon-mlal-padal-dup.c
@@ -0,0 +1,605 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/c2-neon-mull-padal-dup.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+
+  do {
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc1x0123 = vacc0x0123;
+    int32x4_t vacc1x4567 = vacc0x4567;
+    int32x4_t vacc1x89AB = vacc0x89AB;
+    int32x4_t vacc1xCDEF = vacc0xCDEF;
+    int32x4_t vacc2x0123 = vacc0x0123;
+    int32x4_t vacc2x4567 = vacc0x4567;
+    int32x4_t vacc2x89AB = vacc0x89AB;
+    int32x4_t vacc2xCDEF = vacc0xCDEF;
+
+    size_t k = kc;
+
+    while (k >= 16 * sizeof(int8_t)) {
+      const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;
+      const int8x8_t va0x1 = vld1_s8(a0); a0 += 8;
+      const int8x8_t va1x0 = vld1_s8(a1); a1 += 8;
+      const int8x8_t va1x1 = vld1_s8(a1); a1 += 8;
+      const int8x8_t va2x0 = vld1_s8(a2); a2 += 8;
+      const int8x8_t va2x1 = vld1_s8(a2); a2 += 8;
+
+      const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABc0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vbCDEFc0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABc1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vbCDEFc1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABc2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vbCDEFc2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABc3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vbCDEFc3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+      int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0)));
+      int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
+      const int8x8_t vb0123c0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x0123c0 = vmlal_s8(vprod0x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+      vprod1x0123c0 = vmlal_s8(vprod1x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
+      vprod2x0123c0 = vmlal_s8(vprod2x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 0)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+      vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+      int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+      int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0)));
+      int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
+      const int8x8_t vb4567c0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x4567c0 = vmlal_s8(vprod0x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+      vprod1x4567c0 = vmlal_s8(vprod1x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
+      vprod2x4567c0 = vmlal_s8(vprod2x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 0)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+      vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+      int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+      int16x8_t vprod1x89ABc0 = vmull_s8(vb89ABc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0)));
+      int16x8_t vprod2x89ABc0 = vmull_s8(vb89ABc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
+      const int8x8_t vb89ABc0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x89ABc0 = vmlal_s8(vprod0x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+      vprod1x89ABc0 = vmlal_s8(vprod1x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
+      vprod2x89ABc0 = vmlal_s8(vprod2x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 0)));
+      vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+      vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc0);
+      vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
+      int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+      int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0)));
+      int16x8_t vprod2xCDEFc0 = vmull_s8(vbCDEFc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
+      const int8x8_t vbCDEFc0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0xCDEFc0 = vmlal_s8(vprod0xCDEFc0, vbCDEFc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+      vprod1xCDEFc0 = vmlal_s8(vprod1xCDEFc0, vbCDEFc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
+      vprod2xCDEFc0 = vmlal_s8(vprod2xCDEFc0, vbCDEFc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 0)));
+      vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+      vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0);
+      vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0);
+      int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+      int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 1)));
+      int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
+      const int8x8_t vb0123c1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x0123c1 = vmlal_s8(vprod0x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+      vprod1x0123c1 = vmlal_s8(vprod1x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
+      vprod2x0123c1 = vmlal_s8(vprod2x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 1)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+      vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+      int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+      int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 1)));
+      int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
+      const int8x8_t vb4567c1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x4567c1 = vmlal_s8(vprod0x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+      vprod1x4567c1 = vmlal_s8(vprod1x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
+      vprod2x4567c1 = vmlal_s8(vprod2x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 1)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+      vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+      int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+      int16x8_t vprod1x89ABc1 = vmull_s8(vb89ABc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 1)));
+      int16x8_t vprod2x89ABc1 = vmull_s8(vb89ABc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
+      const int8x8_t vb89ABc1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x89ABc1 = vmlal_s8(vprod0x89ABc1, vb89ABc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+      vprod1x89ABc1 = vmlal_s8(vprod1x89ABc1, vb89ABc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
+      vprod2x89ABc1 = vmlal_s8(vprod2x89ABc1, vb89ABc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 1)));
+      vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+      vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc1);
+      vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1);
+      int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+      int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 1)));
+      int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
+      const int8x8_t vbCDEFc1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0xCDEFc1 = vmlal_s8(vprod0xCDEFc1, vbCDEFc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+      vprod1xCDEFc1 = vmlal_s8(vprod1xCDEFc1, vbCDEFc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
+      vprod2xCDEFc1 = vmlal_s8(vprod2xCDEFc1, vbCDEFc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 1)));
+      vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+      vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1);
+      vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1);
+      int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+      int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 2)));
+      int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 2)));
+      const int8x8_t vb0123c2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x0123c2 = vmlal_s8(vprod0x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+      vprod1x0123c2 = vmlal_s8(vprod1x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
+      vprod2x0123c2 = vmlal_s8(vprod2x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 2)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+      vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+      int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+      int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 2)));
+      int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 2)));
+      const int8x8_t vb4567c2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x4567c2 = vmlal_s8(vprod0x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+      vprod1x4567c2 = vmlal_s8(vprod1x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
+      vprod2x4567c2 = vmlal_s8(vprod2x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 2)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+      vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+      int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+      int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 2)));
+      int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 2)));
+      const int8x8_t vb89ABc2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x89ABc2 = vmlal_s8(vprod0x89ABc2, vb89ABc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+      vprod1x89ABc2 = vmlal_s8(vprod1x89ABc2, vb89ABc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
+      vprod2x89ABc2 = vmlal_s8(vprod2x89ABc2, vb89ABc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 2)));
+      vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+      vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
+      vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
+      int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+      int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 2)));
+      int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 2)));
+      const int8x8_t vbCDEFc2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0xCDEFc2 = vmlal_s8(vprod0xCDEFc2, vbCDEFc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+      vprod1xCDEFc2 = vmlal_s8(vprod1xCDEFc2, vbCDEFc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
+      vprod2xCDEFc2 = vmlal_s8(vprod2xCDEFc2, vbCDEFc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 2)));
+      vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+      vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
+      vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2);
+      int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+      int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 3)));
+      int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 3)));
+      const int8x8_t vb0123c3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x0123c3 = vmlal_s8(vprod0x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+      vprod1x0123c3 = vmlal_s8(vprod1x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
+      vprod2x0123c3 = vmlal_s8(vprod2x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 3)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+      vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
+      int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+      int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 3)));
+      int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 3)));
+      const int8x8_t vb4567c3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x4567c3 = vmlal_s8(vprod0x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+      vprod1x4567c3 = vmlal_s8(vprod1x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
+      vprod2x4567c3 = vmlal_s8(vprod2x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 3)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+      vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
+      int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+      int16x8_t vprod1x89ABc3 = vmull_s8(vb89ABc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 3)));
+      int16x8_t vprod2x89ABc3 = vmull_s8(vb89ABc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 3)));
+      const int8x8_t vb89ABc3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x89ABc3 = vmlal_s8(vprod0x89ABc3, vb89ABc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+      vprod1x89ABc3 = vmlal_s8(vprod1x89ABc3, vb89ABc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
+      vprod2x89ABc3 = vmlal_s8(vprod2x89ABc3, vb89ABc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 3)));
+      vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
+      vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc3);
+      vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3);
+      int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+      int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 3)));
+      int16x8_t vprod2xCDEFc3 = vmull_s8(vbCDEFc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 3)));
+      const int8x8_t vbCDEFc3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0xCDEFc3 = vmlal_s8(vprod0xCDEFc3, vbCDEFc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+      vprod1xCDEFc3 = vmlal_s8(vprod1xCDEFc3, vbCDEFc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
+      vprod2xCDEFc3 = vmlal_s8(vprod2xCDEFc3, vbCDEFc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 3)));
+      vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
+      vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3);
+      vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3);
+
+      k -= 16 * sizeof(int8_t);
+    }
+
+    if (k >= 8 * sizeof(int8_t)) {
+      const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+      const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+      const int8x8_t va2 = vld1_s8(a2); a2 += 8;
+
+      const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vbCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vbCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vbCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vbCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+      const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+      const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+      const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+      const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+      const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+      const int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      const int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+      const int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+      const int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+      vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+      vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+      vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+      vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
+      const int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      const int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+      const int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+      const int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+      vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+      vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+      vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+      vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
+      const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+      const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+      const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+      const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+      const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+      const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+      const int16x8_t vprod1x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      const int16x8_t vprod1x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+      const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+      const int16x8_t vprod1x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+      vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc0);
+      vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc1);
+      vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
+      vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc3);
+      const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+      const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+      const int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+      vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0);
+      vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1);
+      vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
+      vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3);
+      const int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+      const int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+      const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+      const int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+      vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+      vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+      vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+      vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
+      const int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+      const int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+      const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+      const int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+      vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+      vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+      vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+      vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
+      const int16x8_t vprod2x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+      const int16x8_t vprod2x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+      const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+      const int16x8_t vprod2x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+      vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
+      vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1);
+      vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
+      vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3);
+      const int16x8_t vprod2xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+      const int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+      const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+      const int16x8_t vprod2xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+      vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0);
+      vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1);
+      vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2);
+      vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3);
+
+      k -= 8 * sizeof(int8_t);
+    }
+
+    if XNN_UNLIKELY(k != 0) {
+      const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+      const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
+      const int8x8_t va2 = vld1_s8(a2); a2 = (const int8_t*) ((uintptr_t) a2 + k);
+
+      const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vbCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+      const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+      const int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+      const int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+      const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+      const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+      const int16x8_t vprod1x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc0);
+      const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0);
+      const int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+      vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+      const int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+      vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+      const int16x8_t vprod2x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+      vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
+      const int16x8_t vprod2xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+      vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0);
+
+      if (k > 2 * sizeof(int8_t)) {
+        const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+        const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+        const int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+        const int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+        const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+        const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+        const int16x8_t vprod1x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc1);
+        const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1);
+        const int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+        const int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+        const int16x8_t vprod2x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+        vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1);
+        const int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+        vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1);
+
+        if (k > 4 * sizeof(int8_t)) {
+          const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb89ABc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vbCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+          vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+          const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+          vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+          const int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+          vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+          const int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+          vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+          const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+          vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+          const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+          vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+          const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+          vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
+          const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+          vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
+          const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+          vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+          const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+          vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+          const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+          vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
+          const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+          vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2);
+        }
+      }
+    }
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+    vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc1x89AB = vqrdmulhq_s32(vacc1x89AB, vmultiplier);
+    vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier);
+    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+    vacc2x89AB = vqrdmulhq_s32(vacc2x89AB, vmultiplier);
+    vacc2xCDEF = vqrdmulhq_s32(vacc2xCDEF, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+    vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc1x89AB = vsraq_n_s32(vacc1x89AB, vbicq_s32(vacc1x89AB, vzero_shift_mask), 31);
+    vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31);
+    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+    vacc2x89AB = vsraq_n_s32(vacc2x89AB, vbicq_s32(vacc2x89AB, vzero_shift_mask), 31);
+    vacc2xCDEF = vsraq_n_s32(vacc2xCDEF, vbicq_s32(vacc2xCDEF, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc1x89AB = vrshlq_s32(vacc1x89AB, vright_shift);
+    vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_shift);
+    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+    vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_shift);
+    vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+    const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+    int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
+    int8x16_t vout2x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc2x89ABCDEF);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+    const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+    int8x16_t vout1x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc1x01234567), vqmovn_s16(vacc1x89ABCDEF));
+    int8x16_t vout2x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc2x89ABCDEF));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+    vout1x0123456789ABCDEF = vmaxq_s8(vout1x0123456789ABCDEF, voutput_min);
+    vout2x0123456789ABCDEF = vmaxq_s8(vout2x0123456789ABCDEF, voutput_min);
+
+    vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+    vout1x0123456789ABCDEF = vminq_s8(vout1x0123456789ABCDEF, voutput_max);
+    vout2x0123456789ABCDEF = vminq_s8(vout2x0123456789ABCDEF, voutput_max);
+
+    if (nc >= 16) {
+      vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+      vst1q_s8(c1 + 0, vout1x0123456789ABCDEF);
+      vst1q_s8(c2 + 0, vout2x0123456789ABCDEF);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
+      nc -= 16;
+    } else {
+      int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8(vout1x0123456789ABCDEF));
+      int8x8_t vout2x01234567 = vget_low_s8(vout2x0123456789ABCDEF);
+      if (nc & 8) {
+        vst1_s8(c0, vget_low_s8(vout0x01234567_1x01234567)); c0 += 8;
+        vst1_s8(c1, vget_high_s8(vout0x01234567_1x01234567)); c1 += 8;
+        vst1_s8(c2, vout2x01234567); c2 += 8;
+        vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0123456789ABCDEF));
+        vout2x01234567 = vget_high_s8(vout2x0123456789ABCDEF);
+      }
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1_lane_s8(c2, vout2x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-gemm/gen/3x16c2-minmax-neon-mull-padal-dup.c b/src/qs8-gemm/gen/3x16c2-minmax-neon-mull-padal-dup.c
new file mode 100644
index 0000000..68671c7
--- /dev/null
+++ b/src/qs8-gemm/gen/3x16c2-minmax-neon-mull-padal-dup.c
@@ -0,0 +1,417 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/c2-neon-mull-padal-dup.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+
+  do {
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc1x0123 = vacc0x0123;
+    int32x4_t vacc1x4567 = vacc0x4567;
+    int32x4_t vacc1x89AB = vacc0x89AB;
+    int32x4_t vacc1xCDEF = vacc0xCDEF;
+    int32x4_t vacc2x0123 = vacc0x0123;
+    int32x4_t vacc2x4567 = vacc0x4567;
+    int32x4_t vacc2x89AB = vacc0x89AB;
+    int32x4_t vacc2xCDEF = vacc0xCDEF;
+
+    size_t k = kc;
+
+
+    while (k >= 8 * sizeof(int8_t)) {
+      const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+      const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+      const int8x8_t va2 = vld1_s8(a2); a2 += 8;
+
+      const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vbCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vbCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vbCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vbCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+      const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+      const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+      const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+      const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+      const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+      const int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      const int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+      const int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+      const int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+      vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+      vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+      vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+      vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
+      const int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      const int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+      const int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+      const int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+      vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+      vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+      vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+      vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
+      const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+      const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+      const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+      const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+      const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+      const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+      const int16x8_t vprod1x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      const int16x8_t vprod1x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+      const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+      const int16x8_t vprod1x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+      vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc0);
+      vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc1);
+      vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
+      vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc3);
+      const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+      const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+      const int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+      vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0);
+      vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1);
+      vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
+      vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3);
+      const int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+      const int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+      const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+      const int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+      vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+      vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+      vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+      vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
+      const int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+      const int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+      const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+      const int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+      vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+      vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+      vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+      vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
+      const int16x8_t vprod2x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+      const int16x8_t vprod2x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+      const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+      const int16x8_t vprod2x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+      vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
+      vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1);
+      vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
+      vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3);
+      const int16x8_t vprod2xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+      const int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+      const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+      const int16x8_t vprod2xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+      vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0);
+      vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1);
+      vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2);
+      vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3);
+
+      k -= 8 * sizeof(int8_t);
+    }
+
+    if XNN_UNLIKELY(k != 0) {
+      const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+      const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
+      const int8x8_t va2 = vld1_s8(a2); a2 = (const int8_t*) ((uintptr_t) a2 + k);
+
+      const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vbCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+      const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+      const int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+      const int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+      const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+      const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+      const int16x8_t vprod1x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc0);
+      const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0);
+      const int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+      vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+      const int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+      vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+      const int16x8_t vprod2x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+      vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
+      const int16x8_t vprod2xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+      vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0);
+
+      if (k > 2 * sizeof(int8_t)) {
+        const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+        const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+        const int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+        const int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+        const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+        const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+        const int16x8_t vprod1x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc1);
+        const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1);
+        const int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+        const int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+        const int16x8_t vprod2x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+        vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1);
+        const int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+        vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1);
+
+        if (k > 4 * sizeof(int8_t)) {
+          const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb89ABc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vbCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+          vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+          const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+          vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+          const int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+          vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+          const int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+          vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+          const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+          vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+          const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+          vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+          const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+          vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
+          const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+          vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
+          const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+          vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+          const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+          vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+          const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+          vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
+          const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+          vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2);
+        }
+      }
+    }
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+    vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc1x89AB = vqrdmulhq_s32(vacc1x89AB, vmultiplier);
+    vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier);
+    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+    vacc2x89AB = vqrdmulhq_s32(vacc2x89AB, vmultiplier);
+    vacc2xCDEF = vqrdmulhq_s32(vacc2xCDEF, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+    vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc1x89AB = vsraq_n_s32(vacc1x89AB, vbicq_s32(vacc1x89AB, vzero_shift_mask), 31);
+    vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31);
+    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+    vacc2x89AB = vsraq_n_s32(vacc2x89AB, vbicq_s32(vacc2x89AB, vzero_shift_mask), 31);
+    vacc2xCDEF = vsraq_n_s32(vacc2xCDEF, vbicq_s32(vacc2xCDEF, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc1x89AB = vrshlq_s32(vacc1x89AB, vright_shift);
+    vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_shift);
+    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+    vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_shift);
+    vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+    const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+    int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
+    int8x16_t vout2x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc2x89ABCDEF);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+    const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+    int8x16_t vout1x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc1x01234567), vqmovn_s16(vacc1x89ABCDEF));
+    int8x16_t vout2x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc2x89ABCDEF));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+    vout1x0123456789ABCDEF = vmaxq_s8(vout1x0123456789ABCDEF, voutput_min);
+    vout2x0123456789ABCDEF = vmaxq_s8(vout2x0123456789ABCDEF, voutput_min);
+
+    vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+    vout1x0123456789ABCDEF = vminq_s8(vout1x0123456789ABCDEF, voutput_max);
+    vout2x0123456789ABCDEF = vminq_s8(vout2x0123456789ABCDEF, voutput_max);
+
+    if (nc >= 16) {
+      vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+      vst1q_s8(c1 + 0, vout1x0123456789ABCDEF);
+      vst1q_s8(c2 + 0, vout2x0123456789ABCDEF);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
+      nc -= 16;
+    } else {
+      int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8(vout1x0123456789ABCDEF));
+      int8x8_t vout2x01234567 = vget_low_s8(vout2x0123456789ABCDEF);
+      if (nc & 8) {
+        vst1_s8(c0, vget_low_s8(vout0x01234567_1x01234567)); c0 += 8;
+        vst1_s8(c1, vget_high_s8(vout0x01234567_1x01234567)); c1 += 8;
+        vst1_s8(c2, vout2x01234567); c2 += 8;
+        vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0123456789ABCDEF));
+        vout2x01234567 = vget_high_s8(vout2x0123456789ABCDEF);
+      }
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1_lane_s8(c2, vout2x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
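The requantization epilogue shared by these generated QS8 kernels multiplies each int32 accumulator by a Q31 fixed-point multiplier (vqrdmulhq_s32), applies a sign correction (vsraq_n_s32 of the vbicq_s32-masked accumulator), and finishes with a rounding shift (vrshlq_s32, where the count loaded from params->neon.right_shift is presumably non-positive so the shift goes right). A scalar sketch of the intended arithmetic follows; the helper name is hypothetical, the shift argument is the positive shift magnitude, and the vqrdmulhq_s32 saturation corner case (both operands INT32_MIN) is ignored:

    #include <stdint.h>

    static inline int32_t requantize_q31(int32_t acc, int32_t multiplier, uint32_t shift) {
      /* vqrdmulhq_s32: rounding doubling multiply, keeping the high 32 bits. */
      const int64_t product = (int64_t) acc * (int64_t) multiplier;
      int32_t q31 = (int32_t) ((product + (INT64_C(1) << 30)) >> 31);
      if (shift != 0) {
        /* vsraq_n_s32 + vrshlq_s32: rounding right shift, halves rounded away from zero. */
        const int64_t rounding = INT64_C(1) << (shift - 1);
        q31 = (int32_t) (((int64_t) q31 - (int64_t) (q31 < 0) + rounding) >> shift);
      }
      return q31;
    }

The 32-bit results are then narrowed to 16 bits with saturation, offset by output_zero_point, narrowed again to int8, and clamped to [output_min, output_max], matching the vqmovn/vqadd/vmax/vmin sequence above.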
diff --git a/src/qs8-gemm/gen/3x16c8-minmax-avx512skx.c b/src/qs8-gemm/gen/3x16c8-minmax-avx512skx.c
index b5e91ec..9970bc3 100644
--- a/src/qs8-gemm/gen/3x16c8-minmax-avx512skx.c
+++ b/src/qs8-gemm/gen/3x16c8-minmax-avx512skx.c
@@ -13,6 +13,7 @@
 
 #include <xnnpack/gemm.h>
 #include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx(
@@ -36,6 +37,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
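The kc = round_up_po2(kc, 8) statements added to this kernel and to the new c8 kernels below let the micro-kernels read A and the packed weights in whole 8-byte groups; the packing is assumed to provide the corresponding padding. A minimal sketch of what round_up_po2 from <xnnpack/math.h> presumably does, using the usual power-of-two bit trick:

    #include <stddef.h>

    static inline size_t round_up_po2(size_t n, size_t q) {
      /* q is assumed to be a power of two: add q - 1, then clear the low bits. */
      return (n + q - 1) & ~(q - 1);
    }

    /* For example, round_up_po2(13, 8) == 16 and round_up_po2(16, 8) == 16. */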
diff --git a/src/qs8-gemm/gen/3x16c8-minmax-neon-mlal-padal.c b/src/qs8-gemm/gen/3x16c8-minmax-neon-mlal-padal.c
new file mode 100644
index 0000000..c9cca37
--- /dev/null
+++ b/src/qs8-gemm/gen/3x16c8-minmax-neon-mlal-padal.c
@@ -0,0 +1,665 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/c8-neon-mull-padal.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+
+  do {
+    int32x4_t vacc0x0 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x1 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x2 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x3 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x4 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x5 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x6 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x7 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x8 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x9 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x10 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x11 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x12 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x13 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x14 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x15 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc1x0 = vacc0x0;
+    int32x4_t vacc1x1 = vacc0x1;
+    int32x4_t vacc1x2 = vacc0x2;
+    int32x4_t vacc1x3 = vacc0x3;
+    int32x4_t vacc1x4 = vacc0x4;
+    int32x4_t vacc1x5 = vacc0x5;
+    int32x4_t vacc1x6 = vacc0x6;
+    int32x4_t vacc1x7 = vacc0x7;
+    int32x4_t vacc1x8 = vacc0x8;
+    int32x4_t vacc1x9 = vacc0x9;
+    int32x4_t vacc1x10 = vacc0x10;
+    int32x4_t vacc1x11 = vacc0x11;
+    int32x4_t vacc1x12 = vacc0x12;
+    int32x4_t vacc1x13 = vacc0x13;
+    int32x4_t vacc1x14 = vacc0x14;
+    int32x4_t vacc1x15 = vacc0x15;
+    int32x4_t vacc2x0 = vacc0x0;
+    int32x4_t vacc2x1 = vacc0x1;
+    int32x4_t vacc2x2 = vacc0x2;
+    int32x4_t vacc2x3 = vacc0x3;
+    int32x4_t vacc2x4 = vacc0x4;
+    int32x4_t vacc2x5 = vacc0x5;
+    int32x4_t vacc2x6 = vacc0x6;
+    int32x4_t vacc2x7 = vacc0x7;
+    int32x4_t vacc2x8 = vacc0x8;
+    int32x4_t vacc2x9 = vacc0x9;
+    int32x4_t vacc2x10 = vacc0x10;
+    int32x4_t vacc2x11 = vacc0x11;
+    int32x4_t vacc2x12 = vacc0x12;
+    int32x4_t vacc2x13 = vacc0x13;
+    int32x4_t vacc2x14 = vacc0x14;
+    int32x4_t vacc2x15 = vacc0x15;
+
+    size_t k = kc;
+    // Loop partially unrolled 2x: loads 16 bytes at a time and accumulates with MLAL.
+    while (k >= 16 * sizeof(int8_t)) {
+      const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;
+      const int8x8_t va0x1 = vld1_s8(a0); a0 += 8;
+      const int8x8_t va1x0 = vld1_s8(a1); a1 += 8;
+      const int8x8_t va1x1 = vld1_s8(a1); a1 += 8;
+      const int8x8_t va2x0 = vld1_s8(a2); a2 += 8;
+      const int8x8_t va2x1 = vld1_s8(a2); a2 += 8;
+
+      const int8x8_t vb0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb5x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb6x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb7x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb8x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb9x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb10x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb11x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb12x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb13x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb14x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb15x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int8x8_t vb0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      int16x8_t vprod0x0 = vmull_s8(vb0x0, va0x0);
+      int16x8_t vprod1x0 = vmull_s8(vb0x0, va1x0);
+      int16x8_t vprod2x0 = vmull_s8(vb0x0, va2x0);
+      vprod0x0 = vmlal_s8(vprod0x0, vb0x1, va0x1);
+      vprod1x0 = vmlal_s8(vprod1x0, vb0x1, va1x1);
+      vprod2x0 = vmlal_s8(vprod2x0, vb0x1, va2x1);
+      vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+      vacc1x0 = vpadalq_s16(vacc1x0, vprod1x0);
+      vacc2x0 = vpadalq_s16(vacc2x0, vprod2x0);
+      const int8x8_t vb1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      int16x8_t vprod0x1 = vmull_s8(vb1x0, va0x0);
+      int16x8_t vprod1x1 = vmull_s8(vb1x0, va1x0);
+      int16x8_t vprod2x1 = vmull_s8(vb1x0, va2x0);
+      vprod0x1 = vmlal_s8(vprod0x1, vb1x1, va0x1);
+      vprod1x1 = vmlal_s8(vprod1x1, vb1x1, va1x1);
+      vprod2x1 = vmlal_s8(vprod2x1, vb1x1, va2x1);
+      vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+      vacc1x1 = vpadalq_s16(vacc1x1, vprod1x1);
+      vacc2x1 = vpadalq_s16(vacc2x1, vprod2x1);
+      const int8x8_t vb2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      int16x8_t vprod0x2 = vmull_s8(vb2x0, va0x0);
+      int16x8_t vprod1x2 = vmull_s8(vb2x0, va1x0);
+      int16x8_t vprod2x2 = vmull_s8(vb2x0, va2x0);
+      vprod0x2 = vmlal_s8(vprod0x2, vb2x1, va0x1);
+      vprod1x2 = vmlal_s8(vprod1x2, vb2x1, va1x1);
+      vprod2x2 = vmlal_s8(vprod2x2, vb2x1, va2x1);
+      vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+      vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);
+      vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
+      const int8x8_t vb3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      int16x8_t vprod0x3 = vmull_s8(vb3x0, va0x0);
+      int16x8_t vprod1x3 = vmull_s8(vb3x0, va1x0);
+      int16x8_t vprod2x3 = vmull_s8(vb3x0, va2x0);
+      vprod0x3 = vmlal_s8(vprod0x3, vb3x1, va0x1);
+      vprod1x3 = vmlal_s8(vprod1x3, vb3x1, va1x1);
+      vprod2x3 = vmlal_s8(vprod2x3, vb3x1, va2x1);
+      vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+      vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);
+      vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
+      const int8x8_t vb4x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      int16x8_t vprod0x4 = vmull_s8(vb4x0, va0x0);
+      int16x8_t vprod1x4 = vmull_s8(vb4x0, va1x0);
+      int16x8_t vprod2x4 = vmull_s8(vb4x0, va2x0);
+      vprod0x4 = vmlal_s8(vprod0x4, vb4x1, va0x1);
+      vprod1x4 = vmlal_s8(vprod1x4, vb4x1, va1x1);
+      vprod2x4 = vmlal_s8(vprod2x4, vb4x1, va2x1);
+      vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+      vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);
+      vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);
+      const int8x8_t vb5x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      int16x8_t vprod0x5 = vmull_s8(vb5x0, va0x0);
+      int16x8_t vprod1x5 = vmull_s8(vb5x0, va1x0);
+      int16x8_t vprod2x5 = vmull_s8(vb5x0, va2x0);
+      vprod0x5 = vmlal_s8(vprod0x5, vb5x1, va0x1);
+      vprod1x5 = vmlal_s8(vprod1x5, vb5x1, va1x1);
+      vprod2x5 = vmlal_s8(vprod2x5, vb5x1, va2x1);
+      vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+      vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);
+      vacc2x5 = vpadalq_s16(vacc2x5, vprod2x5);
+      const int8x8_t vb6x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      int16x8_t vprod0x6 = vmull_s8(vb6x0, va0x0);
+      int16x8_t vprod1x6 = vmull_s8(vb6x0, va1x0);
+      int16x8_t vprod2x6 = vmull_s8(vb6x0, va2x0);
+      vprod0x6 = vmlal_s8(vprod0x6, vb6x1, va0x1);
+      vprod1x6 = vmlal_s8(vprod1x6, vb6x1, va1x1);
+      vprod2x6 = vmlal_s8(vprod2x6, vb6x1, va2x1);
+      vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+      vacc1x6 = vpadalq_s16(vacc1x6, vprod1x6);
+      vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6);
+      const int8x8_t vb7x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      int16x8_t vprod0x7 = vmull_s8(vb7x0, va0x0);
+      int16x8_t vprod1x7 = vmull_s8(vb7x0, va1x0);
+      int16x8_t vprod2x7 = vmull_s8(vb7x0, va2x0);
+      vprod0x7 = vmlal_s8(vprod0x7, vb7x1, va0x1);
+      vprod1x7 = vmlal_s8(vprod1x7, vb7x1, va1x1);
+      vprod2x7 = vmlal_s8(vprod2x7, vb7x1, va2x1);
+      vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+      vacc1x7 = vpadalq_s16(vacc1x7, vprod1x7);
+      vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7);
+      const int8x8_t vb8x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      int16x8_t vprod0x8 = vmull_s8(vb8x0, va0x0);
+      int16x8_t vprod1x8 = vmull_s8(vb8x0, va1x0);
+      int16x8_t vprod2x8 = vmull_s8(vb8x0, va2x0);
+      vprod0x8 = vmlal_s8(vprod0x8, vb8x1, va0x1);
+      vprod1x8 = vmlal_s8(vprod1x8, vb8x1, va1x1);
+      vprod2x8 = vmlal_s8(vprod2x8, vb8x1, va2x1);
+      vacc0x8 = vpadalq_s16(vacc0x8, vprod0x8);
+      vacc1x8 = vpadalq_s16(vacc1x8, vprod1x8);
+      vacc2x8 = vpadalq_s16(vacc2x8, vprod2x8);
+      const int8x8_t vb9x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      int16x8_t vprod0x9 = vmull_s8(vb9x0, va0x0);
+      int16x8_t vprod1x9 = vmull_s8(vb9x0, va1x0);
+      int16x8_t vprod2x9 = vmull_s8(vb9x0, va2x0);
+      vprod0x9 = vmlal_s8(vprod0x9, vb9x1, va0x1);
+      vprod1x9 = vmlal_s8(vprod1x9, vb9x1, va1x1);
+      vprod2x9 = vmlal_s8(vprod2x9, vb9x1, va2x1);
+      vacc0x9 = vpadalq_s16(vacc0x9, vprod0x9);
+      vacc1x9 = vpadalq_s16(vacc1x9, vprod1x9);
+      vacc2x9 = vpadalq_s16(vacc2x9, vprod2x9);
+      const int8x8_t vb10x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      int16x8_t vprod0x10 = vmull_s8(vb10x0, va0x0);
+      int16x8_t vprod1x10 = vmull_s8(vb10x0, va1x0);
+      int16x8_t vprod2x10 = vmull_s8(vb10x0, va2x0);
+      vprod0x10 = vmlal_s8(vprod0x10, vb10x1, va0x1);
+      vprod1x10 = vmlal_s8(vprod1x10, vb10x1, va1x1);
+      vprod2x10 = vmlal_s8(vprod2x10, vb10x1, va2x1);
+      vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
+      vacc1x10 = vpadalq_s16(vacc1x10, vprod1x10);
+      vacc2x10 = vpadalq_s16(vacc2x10, vprod2x10);
+      const int8x8_t vb11x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      int16x8_t vprod0x11 = vmull_s8(vb11x0, va0x0);
+      int16x8_t vprod1x11 = vmull_s8(vb11x0, va1x0);
+      int16x8_t vprod2x11 = vmull_s8(vb11x0, va2x0);
+      vprod0x11 = vmlal_s8(vprod0x11, vb11x1, va0x1);
+      vprod1x11 = vmlal_s8(vprod1x11, vb11x1, va1x1);
+      vprod2x11 = vmlal_s8(vprod2x11, vb11x1, va2x1);
+      vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11);
+      vacc1x11 = vpadalq_s16(vacc1x11, vprod1x11);
+      vacc2x11 = vpadalq_s16(vacc2x11, vprod2x11);
+      const int8x8_t vb12x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      int16x8_t vprod0x12 = vmull_s8(vb12x0, va0x0);
+      int16x8_t vprod1x12 = vmull_s8(vb12x0, va1x0);
+      int16x8_t vprod2x12 = vmull_s8(vb12x0, va2x0);
+      vprod0x12 = vmlal_s8(vprod0x12, vb12x1, va0x1);
+      vprod1x12 = vmlal_s8(vprod1x12, vb12x1, va1x1);
+      vprod2x12 = vmlal_s8(vprod2x12, vb12x1, va2x1);
+      vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);
+      vacc1x12 = vpadalq_s16(vacc1x12, vprod1x12);
+      vacc2x12 = vpadalq_s16(vacc2x12, vprod2x12);
+      const int8x8_t vb13x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      int16x8_t vprod0x13 = vmull_s8(vb13x0, va0x0);
+      int16x8_t vprod1x13 = vmull_s8(vb13x0, va1x0);
+      int16x8_t vprod2x13 = vmull_s8(vb13x0, va2x0);
+      vprod0x13 = vmlal_s8(vprod0x13, vb13x1, va0x1);
+      vprod1x13 = vmlal_s8(vprod1x13, vb13x1, va1x1);
+      vprod2x13 = vmlal_s8(vprod2x13, vb13x1, va2x1);
+      vacc0x13 = vpadalq_s16(vacc0x13, vprod0x13);
+      vacc1x13 = vpadalq_s16(vacc1x13, vprod1x13);
+      vacc2x13 = vpadalq_s16(vacc2x13, vprod2x13);
+      const int8x8_t vb14x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      int16x8_t vprod0x14 = vmull_s8(vb14x0, va0x0);
+      int16x8_t vprod1x14 = vmull_s8(vb14x0, va1x0);
+      int16x8_t vprod2x14 = vmull_s8(vb14x0, va2x0);
+      vprod0x14 = vmlal_s8(vprod0x14, vb14x1, va0x1);
+      vprod1x14 = vmlal_s8(vprod1x14, vb14x1, va1x1);
+      vprod2x14 = vmlal_s8(vprod2x14, vb14x1, va2x1);
+      vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
+      vacc1x14 = vpadalq_s16(vacc1x14, vprod1x14);
+      vacc2x14 = vpadalq_s16(vacc2x14, vprod2x14);
+      const int8x8_t vb15x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      int16x8_t vprod0x15 = vmull_s8(vb15x0, va0x0);
+      int16x8_t vprod1x15 = vmull_s8(vb15x0, va1x0);
+      int16x8_t vprod2x15 = vmull_s8(vb15x0, va2x0);
+      vprod0x15 = vmlal_s8(vprod0x15, vb15x1, va0x1);
+      vprod1x15 = vmlal_s8(vprod1x15, vb15x1, va1x1);
+      vprod2x15 = vmlal_s8(vprod2x15, vb15x1, va2x1);
+      vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
+      vacc1x15 = vpadalq_s16(vacc1x15, vprod1x15);
+      vacc2x15 = vpadalq_s16(vacc2x15, vprod2x15);
+
+      k -= 16 * sizeof(int8_t);
+    }
+
+    // Handle 8 bytes at a time using MUL.
+    if (k > 0) {
+      const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+      const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+      const int8x8_t va2 = vld1_s8(a2); a2 += 8;
+
+      const int8x8_t vb0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x0 = vmull_s8(vb0, va0);
+      const int16x8_t vprod1x0 = vmull_s8(vb0, va1);
+      const int16x8_t vprod2x0 = vmull_s8(vb0, va2);
+      vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+      vacc1x0 = vpadalq_s16(vacc1x0, vprod1x0);
+      vacc2x0 = vpadalq_s16(vacc2x0, vprod2x0);
+      const int8x8_t vb1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x1 = vmull_s8(vb1, va0);
+      const int16x8_t vprod1x1 = vmull_s8(vb1, va1);
+      const int16x8_t vprod2x1 = vmull_s8(vb1, va2);
+      vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+      vacc1x1 = vpadalq_s16(vacc1x1, vprod1x1);
+      vacc2x1 = vpadalq_s16(vacc2x1, vprod2x1);
+      const int8x8_t vb2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x2 = vmull_s8(vb2, va0);
+      const int16x8_t vprod1x2 = vmull_s8(vb2, va1);
+      const int16x8_t vprod2x2 = vmull_s8(vb2, va2);
+      vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+      vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);
+      vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
+      const int8x8_t vb3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x3 = vmull_s8(vb3, va0);
+      const int16x8_t vprod1x3 = vmull_s8(vb3, va1);
+      const int16x8_t vprod2x3 = vmull_s8(vb3, va2);
+      vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+      vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);
+      vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
+      const int8x8_t vb4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x4 = vmull_s8(vb4, va0);
+      const int16x8_t vprod1x4 = vmull_s8(vb4, va1);
+      const int16x8_t vprod2x4 = vmull_s8(vb4, va2);
+      vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+      vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);
+      vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);
+      const int8x8_t vb5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x5 = vmull_s8(vb5, va0);
+      const int16x8_t vprod1x5 = vmull_s8(vb5, va1);
+      const int16x8_t vprod2x5 = vmull_s8(vb5, va2);
+      vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+      vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);
+      vacc2x5 = vpadalq_s16(vacc2x5, vprod2x5);
+      const int8x8_t vb6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x6 = vmull_s8(vb6, va0);
+      const int16x8_t vprod1x6 = vmull_s8(vb6, va1);
+      const int16x8_t vprod2x6 = vmull_s8(vb6, va2);
+      vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+      vacc1x6 = vpadalq_s16(vacc1x6, vprod1x6);
+      vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6);
+      const int8x8_t vb7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x7 = vmull_s8(vb7, va0);
+      const int16x8_t vprod1x7 = vmull_s8(vb7, va1);
+      const int16x8_t vprod2x7 = vmull_s8(vb7, va2);
+      vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+      vacc1x7 = vpadalq_s16(vacc1x7, vprod1x7);
+      vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7);
+      const int8x8_t vb8 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x8 = vmull_s8(vb8, va0);
+      const int16x8_t vprod1x8 = vmull_s8(vb8, va1);
+      const int16x8_t vprod2x8 = vmull_s8(vb8, va2);
+      vacc0x8 = vpadalq_s16(vacc0x8, vprod0x8);
+      vacc1x8 = vpadalq_s16(vacc1x8, vprod1x8);
+      vacc2x8 = vpadalq_s16(vacc2x8, vprod2x8);
+      const int8x8_t vb9 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x9 = vmull_s8(vb9, va0);
+      const int16x8_t vprod1x9 = vmull_s8(vb9, va1);
+      const int16x8_t vprod2x9 = vmull_s8(vb9, va2);
+      vacc0x9 = vpadalq_s16(vacc0x9, vprod0x9);
+      vacc1x9 = vpadalq_s16(vacc1x9, vprod1x9);
+      vacc2x9 = vpadalq_s16(vacc2x9, vprod2x9);
+      const int8x8_t vb10 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x10 = vmull_s8(vb10, va0);
+      const int16x8_t vprod1x10 = vmull_s8(vb10, va1);
+      const int16x8_t vprod2x10 = vmull_s8(vb10, va2);
+      vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
+      vacc1x10 = vpadalq_s16(vacc1x10, vprod1x10);
+      vacc2x10 = vpadalq_s16(vacc2x10, vprod2x10);
+      const int8x8_t vb11 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x11 = vmull_s8(vb11, va0);
+      const int16x8_t vprod1x11 = vmull_s8(vb11, va1);
+      const int16x8_t vprod2x11 = vmull_s8(vb11, va2);
+      vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11);
+      vacc1x11 = vpadalq_s16(vacc1x11, vprod1x11);
+      vacc2x11 = vpadalq_s16(vacc2x11, vprod2x11);
+      const int8x8_t vb12 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x12 = vmull_s8(vb12, va0);
+      const int16x8_t vprod1x12 = vmull_s8(vb12, va1);
+      const int16x8_t vprod2x12 = vmull_s8(vb12, va2);
+      vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);
+      vacc1x12 = vpadalq_s16(vacc1x12, vprod1x12);
+      vacc2x12 = vpadalq_s16(vacc2x12, vprod2x12);
+      const int8x8_t vb13 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x13 = vmull_s8(vb13, va0);
+      const int16x8_t vprod1x13 = vmull_s8(vb13, va1);
+      const int16x8_t vprod2x13 = vmull_s8(vb13, va2);
+      vacc0x13 = vpadalq_s16(vacc0x13, vprod0x13);
+      vacc1x13 = vpadalq_s16(vacc1x13, vprod1x13);
+      vacc2x13 = vpadalq_s16(vacc2x13, vprod2x13);
+      const int8x8_t vb14 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x14 = vmull_s8(vb14, va0);
+      const int16x8_t vprod1x14 = vmull_s8(vb14, va1);
+      const int16x8_t vprod2x14 = vmull_s8(vb14, va2);
+      vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
+      vacc1x14 = vpadalq_s16(vacc1x14, vprod1x14);
+      vacc2x14 = vpadalq_s16(vacc2x14, vprod2x14);
+      const int8x8_t vb15 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x15 = vmull_s8(vb15, va0);
+      const int16x8_t vprod1x15 = vmull_s8(vb15, va1);
+      const int16x8_t vprod2x15 = vmull_s8(vb15, va2);
+      vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
+      vacc1x15 = vpadalq_s16(vacc1x15, vprod1x15);
+      vacc2x15 = vpadalq_s16(vacc2x15, vprod2x15);
+
+      k -= 8 * sizeof(int8_t);
+    }
+
+#if XNN_ARCH_ARM64
+    const int32x4_t vsum0x01 = vpaddq_s32(vacc0x0, vacc0x1);
+    const int32x4_t vsum0x23 = vpaddq_s32(vacc0x2, vacc0x3);
+    const int32x4_t vsum0x45 = vpaddq_s32(vacc0x4, vacc0x5);
+    const int32x4_t vsum0x67 = vpaddq_s32(vacc0x6, vacc0x7);
+    const int32x4_t vsum0x89 = vpaddq_s32(vacc0x8, vacc0x9);
+    const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11);
+    const int32x4_t vsum0xCD = vpaddq_s32(vacc0x12, vacc0x13);
+    const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
+    const int32x4_t vsum1x01 = vpaddq_s32(vacc1x0, vacc1x1);
+    const int32x4_t vsum1x23 = vpaddq_s32(vacc1x2, vacc1x3);
+    const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);
+    const int32x4_t vsum1x67 = vpaddq_s32(vacc1x6, vacc1x7);
+    const int32x4_t vsum1x89 = vpaddq_s32(vacc1x8, vacc1x9);
+    const int32x4_t vsum1xAB = vpaddq_s32(vacc1x10, vacc1x11);
+    const int32x4_t vsum1xCD = vpaddq_s32(vacc1x12, vacc1x13);
+    const int32x4_t vsum1xEF = vpaddq_s32(vacc1x14, vacc1x15);
+    const int32x4_t vsum2x01 = vpaddq_s32(vacc2x0, vacc2x1);
+    const int32x4_t vsum2x23 = vpaddq_s32(vacc2x2, vacc2x3);
+    const int32x4_t vsum2x45 = vpaddq_s32(vacc2x4, vacc2x5);
+    const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7);
+    const int32x4_t vsum2x89 = vpaddq_s32(vacc2x8, vacc2x9);
+    const int32x4_t vsum2xAB = vpaddq_s32(vacc2x10, vacc2x11);
+    const int32x4_t vsum2xCD = vpaddq_s32(vacc2x12, vacc2x13);
+    const int32x4_t vsum2xEF = vpaddq_s32(vacc2x14, vacc2x15);
+    int32x4_t vacc0x0123 = vpaddq_s32(vsum0x01, vsum0x23);
+    int32x4_t vacc0x4567 = vpaddq_s32(vsum0x45, vsum0x67);
+    int32x4_t vacc0x89AB = vpaddq_s32(vsum0x89, vsum0xAB);
+    int32x4_t vacc0xCDEF = vpaddq_s32(vsum0xCD, vsum0xEF);
+    int32x4_t vacc1x0123 = vpaddq_s32(vsum1x01, vsum1x23);
+    int32x4_t vacc1x4567 = vpaddq_s32(vsum1x45, vsum1x67);
+    int32x4_t vacc1x89AB = vpaddq_s32(vsum1x89, vsum1xAB);
+    int32x4_t vacc1xCDEF = vpaddq_s32(vsum1xCD, vsum1xEF);
+    int32x4_t vacc2x0123 = vpaddq_s32(vsum2x01, vsum2x23);
+    int32x4_t vacc2x4567 = vpaddq_s32(vsum2x45, vsum2x67);
+    int32x4_t vacc2x89AB = vpaddq_s32(vsum2x89, vsum2xAB);
+    int32x4_t vacc2xCDEF = vpaddq_s32(vsum2xCD, vsum2xEF);
+#else
+    const int32x2_t vpsum0x0 = vadd_s32(vget_low_s32(vacc0x0), vget_high_s32(vacc0x0));
+    const int32x2_t vpsum0x1 = vadd_s32(vget_low_s32(vacc0x1), vget_high_s32(vacc0x1));
+    const int32x2_t vpsum0x2 = vadd_s32(vget_low_s32(vacc0x2), vget_high_s32(vacc0x2));
+    const int32x2_t vpsum0x3 = vadd_s32(vget_low_s32(vacc0x3), vget_high_s32(vacc0x3));
+    const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1);
+    const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3);
+    int32x4_t vacc0x0123 = vcombine_s32(vsum0x01, vsum0x23);
+    const int32x2_t vpsum0x4 = vadd_s32(vget_low_s32(vacc0x4), vget_high_s32(vacc0x4));
+    const int32x2_t vpsum0x5 = vadd_s32(vget_low_s32(vacc0x5), vget_high_s32(vacc0x5));
+    const int32x2_t vpsum0x6 = vadd_s32(vget_low_s32(vacc0x6), vget_high_s32(vacc0x6));
+    const int32x2_t vpsum0x7 = vadd_s32(vget_low_s32(vacc0x7), vget_high_s32(vacc0x7));
+    const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5);
+    const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7);
+    int32x4_t vacc0x4567 = vcombine_s32(vsum0x45, vsum0x67);
+    const int32x2_t vpsum0x8 = vadd_s32(vget_low_s32(vacc0x8), vget_high_s32(vacc0x8));
+    const int32x2_t vpsum0x9 = vadd_s32(vget_low_s32(vacc0x9), vget_high_s32(vacc0x9));
+    const int32x2_t vpsum0xA = vadd_s32(vget_low_s32(vacc0x10), vget_high_s32(vacc0x10));
+    const int32x2_t vpsum0xB = vadd_s32(vget_low_s32(vacc0x11), vget_high_s32(vacc0x11));
+    const int32x2_t vsum0x89 = vpadd_s32(vpsum0x8, vpsum0x9);
+    const int32x2_t vsum0xAB = vpadd_s32(vpsum0xA, vpsum0xB);
+    int32x4_t vacc0x89AB = vcombine_s32(vsum0x89, vsum0xAB);
+    const int32x2_t vpsum0xC = vadd_s32(vget_low_s32(vacc0x12), vget_high_s32(vacc0x12));
+    const int32x2_t vpsum0xD = vadd_s32(vget_low_s32(vacc0x13), vget_high_s32(vacc0x13));
+    const int32x2_t vpsum0xE = vadd_s32(vget_low_s32(vacc0x14), vget_high_s32(vacc0x14));
+    const int32x2_t vpsum0xF = vadd_s32(vget_low_s32(vacc0x15), vget_high_s32(vacc0x15));
+    const int32x2_t vsum0xCD = vpadd_s32(vpsum0xC, vpsum0xD);
+    const int32x2_t vsum0xEF = vpadd_s32(vpsum0xE, vpsum0xF);
+    int32x4_t vacc0xCDEF = vcombine_s32(vsum0xCD, vsum0xEF);
+    const int32x2_t vpsum1x0 = vadd_s32(vget_low_s32(vacc1x0), vget_high_s32(vacc1x0));
+    const int32x2_t vpsum1x1 = vadd_s32(vget_low_s32(vacc1x1), vget_high_s32(vacc1x1));
+    const int32x2_t vpsum1x2 = vadd_s32(vget_low_s32(vacc1x2), vget_high_s32(vacc1x2));
+    const int32x2_t vpsum1x3 = vadd_s32(vget_low_s32(vacc1x3), vget_high_s32(vacc1x3));
+    const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1);
+    const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3);
+    int32x4_t vacc1x0123 = vcombine_s32(vsum1x01, vsum1x23);
+    const int32x2_t vpsum1x4 = vadd_s32(vget_low_s32(vacc1x4), vget_high_s32(vacc1x4));
+    const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));
+    const int32x2_t vpsum1x6 = vadd_s32(vget_low_s32(vacc1x6), vget_high_s32(vacc1x6));
+    const int32x2_t vpsum1x7 = vadd_s32(vget_low_s32(vacc1x7), vget_high_s32(vacc1x7));
+    const int32x2_t vsum1x45 = vpadd_s32(vpsum1x4, vpsum1x5);
+    const int32x2_t vsum1x67 = vpadd_s32(vpsum1x6, vpsum1x7);
+    int32x4_t vacc1x4567 = vcombine_s32(vsum1x45, vsum1x67);
+    const int32x2_t vpsum1x8 = vadd_s32(vget_low_s32(vacc1x8), vget_high_s32(vacc1x8));
+    const int32x2_t vpsum1x9 = vadd_s32(vget_low_s32(vacc1x9), vget_high_s32(vacc1x9));
+    const int32x2_t vpsum1xA = vadd_s32(vget_low_s32(vacc1x10), vget_high_s32(vacc1x10));
+    const int32x2_t vpsum1xB = vadd_s32(vget_low_s32(vacc1x11), vget_high_s32(vacc1x11));
+    const int32x2_t vsum1x89 = vpadd_s32(vpsum1x8, vpsum1x9);
+    const int32x2_t vsum1xAB = vpadd_s32(vpsum1xA, vpsum1xB);
+    int32x4_t vacc1x89AB = vcombine_s32(vsum1x89, vsum1xAB);
+    const int32x2_t vpsum1xC = vadd_s32(vget_low_s32(vacc1x12), vget_high_s32(vacc1x12));
+    const int32x2_t vpsum1xD = vadd_s32(vget_low_s32(vacc1x13), vget_high_s32(vacc1x13));
+    const int32x2_t vpsum1xE = vadd_s32(vget_low_s32(vacc1x14), vget_high_s32(vacc1x14));
+    const int32x2_t vpsum1xF = vadd_s32(vget_low_s32(vacc1x15), vget_high_s32(vacc1x15));
+    const int32x2_t vsum1xCD = vpadd_s32(vpsum1xC, vpsum1xD);
+    const int32x2_t vsum1xEF = vpadd_s32(vpsum1xE, vpsum1xF);
+    int32x4_t vacc1xCDEF = vcombine_s32(vsum1xCD, vsum1xEF);
+    const int32x2_t vpsum2x0 = vadd_s32(vget_low_s32(vacc2x0), vget_high_s32(vacc2x0));
+    const int32x2_t vpsum2x1 = vadd_s32(vget_low_s32(vacc2x1), vget_high_s32(vacc2x1));
+    const int32x2_t vpsum2x2 = vadd_s32(vget_low_s32(vacc2x2), vget_high_s32(vacc2x2));
+    const int32x2_t vpsum2x3 = vadd_s32(vget_low_s32(vacc2x3), vget_high_s32(vacc2x3));
+    const int32x2_t vsum2x01 = vpadd_s32(vpsum2x0, vpsum2x1);
+    const int32x2_t vsum2x23 = vpadd_s32(vpsum2x2, vpsum2x3);
+    int32x4_t vacc2x0123 = vcombine_s32(vsum2x01, vsum2x23);
+    const int32x2_t vpsum2x4 = vadd_s32(vget_low_s32(vacc2x4), vget_high_s32(vacc2x4));
+    const int32x2_t vpsum2x5 = vadd_s32(vget_low_s32(vacc2x5), vget_high_s32(vacc2x5));
+    const int32x2_t vpsum2x6 = vadd_s32(vget_low_s32(vacc2x6), vget_high_s32(vacc2x6));
+    const int32x2_t vpsum2x7 = vadd_s32(vget_low_s32(vacc2x7), vget_high_s32(vacc2x7));
+    const int32x2_t vsum2x45 = vpadd_s32(vpsum2x4, vpsum2x5);
+    const int32x2_t vsum2x67 = vpadd_s32(vpsum2x6, vpsum2x7);
+    int32x4_t vacc2x4567 = vcombine_s32(vsum2x45, vsum2x67);
+    const int32x2_t vpsum2x8 = vadd_s32(vget_low_s32(vacc2x8), vget_high_s32(vacc2x8));
+    const int32x2_t vpsum2x9 = vadd_s32(vget_low_s32(vacc2x9), vget_high_s32(vacc2x9));
+    const int32x2_t vpsum2xA = vadd_s32(vget_low_s32(vacc2x10), vget_high_s32(vacc2x10));
+    const int32x2_t vpsum2xB = vadd_s32(vget_low_s32(vacc2x11), vget_high_s32(vacc2x11));
+    const int32x2_t vsum2x89 = vpadd_s32(vpsum2x8, vpsum2x9);
+    const int32x2_t vsum2xAB = vpadd_s32(vpsum2xA, vpsum2xB);
+    int32x4_t vacc2x89AB = vcombine_s32(vsum2x89, vsum2xAB);
+    const int32x2_t vpsum2xC = vadd_s32(vget_low_s32(vacc2x12), vget_high_s32(vacc2x12));
+    const int32x2_t vpsum2xD = vadd_s32(vget_low_s32(vacc2x13), vget_high_s32(vacc2x13));
+    const int32x2_t vpsum2xE = vadd_s32(vget_low_s32(vacc2x14), vget_high_s32(vacc2x14));
+    const int32x2_t vpsum2xF = vadd_s32(vget_low_s32(vacc2x15), vget_high_s32(vacc2x15));
+    const int32x2_t vsum2xCD = vpadd_s32(vpsum2xC, vpsum2xD);
+    const int32x2_t vsum2xEF = vpadd_s32(vpsum2xE, vpsum2xF);
+    int32x4_t vacc2xCDEF = vcombine_s32(vsum2xCD, vsum2xEF);
+#endif
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+    vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc1x89AB = vqrdmulhq_s32(vacc1x89AB, vmultiplier);
+    vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier);
+    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+    vacc2x89AB = vqrdmulhq_s32(vacc2x89AB, vmultiplier);
+    vacc2xCDEF = vqrdmulhq_s32(vacc2xCDEF, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+    vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc1x89AB = vsraq_n_s32(vacc1x89AB, vbicq_s32(vacc1x89AB, vzero_shift_mask), 31);
+    vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31);
+    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+    vacc2x89AB = vsraq_n_s32(vacc2x89AB, vbicq_s32(vacc2x89AB, vzero_shift_mask), 31);
+    vacc2xCDEF = vsraq_n_s32(vacc2xCDEF, vbicq_s32(vacc2xCDEF, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc1x89AB = vrshlq_s32(vacc1x89AB, vright_shift);
+    vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_shift);
+    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+    vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_shift);
+    vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+    const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), voutput_zero_point);
+    int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+    int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
+    int8x16_t vout2x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc2x89ABCDEF);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+    const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+    int8x16_t vout1x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc1x01234567), vqmovn_s16(vacc1x89ABCDEF));
+    int8x16_t vout2x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc2x89ABCDEF));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+    vout1x0123456789ABCDEF = vmaxq_s8(vout1x0123456789ABCDEF, voutput_min);
+    vout2x0123456789ABCDEF = vmaxq_s8(vout2x0123456789ABCDEF, voutput_min);
+
+    vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+    vout1x0123456789ABCDEF = vminq_s8(vout1x0123456789ABCDEF, voutput_max);
+    vout2x0123456789ABCDEF = vminq_s8(vout2x0123456789ABCDEF, voutput_max);
+
+    if (nc >= 16) {
+      vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+      vst1q_s8(c1 + 0, vout1x0123456789ABCDEF);
+      vst1q_s8(c2 + 0, vout2x0123456789ABCDEF);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
+      nc -= 16;
+    } else {
+      int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8(vout1x0123456789ABCDEF));
+      int8x8_t vout2x01234567 = vget_low_s8(vout2x0123456789ABCDEF);
+      if (nc & 8) {
+        vst1_s8(c0, vget_low_s8(vout0x01234567_1x01234567)); c0 += 8;
+        vst1_s8(c1, vget_high_s8(vout0x01234567_1x01234567)); c1 += 8;
+        vst1_s8(c2, vout2x01234567); c2 += 8;
+        vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0123456789ABCDEF));
+        vout2x01234567 = vget_high_s8(vout2x0123456789ABCDEF);
+      }
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1_lane_s8(c2, vout2x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
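In the 16-byte main loop of the mlal-padal kernel above, every output channel accumulates two 8-wide int8 products per iteration: vmull_s8 widens the first eight products to int16, vmlal_s8 adds the next eight while still in int16, and vpadalq_s16 folds the int16 lanes into the int32 accumulators. An illustrative scalar model of a single row/output-channel dot product is sketched below; the helper name is hypothetical, b is assumed to hold the kc weights of one output channel contiguously (the real kernel reads them from the interleaved packed stream w), and kc is assumed to already be a multiple of 16:

    #include <stddef.h>
    #include <stdint.h>

    static int32_t dot_qs8_c8_mlal(const int8_t* a, const int8_t* b, size_t kc, int32_t bias) {
      int32_t acc = bias;  /* the kernels seed each accumulator with the packed per-channel bias */
      for (size_t k = 0; k < kc; k += 16) {
        for (size_t j = 0; j < 8; j++) {
          /* vmull_s8: widening int8*int8 product into an int16 lane. */
          int16_t prod = (int16_t) ((int32_t) a[k + j] * (int32_t) b[k + j]);
          /* vmlal_s8: a second product is added while still int16, which relies on the operands
             leaving headroom (e.g. weights restricted to [-127, 127]). */
          prod = (int16_t) (prod + (int32_t) a[k + 8 + j] * (int32_t) b[k + 8 + j]);
          /* vpadalq_s16: widen and add the int16 partial sums into the int32 accumulator. */
          acc += prod;
        }
      }
      return acc;
    }

The mull-padal variant below skips the vmlal_s8 step and widens every group of eight products immediately, so its whole k loop runs eight bytes at a time.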
diff --git a/src/qs8-gemm/gen/3x16c8-minmax-neon-mull-padal.c b/src/qs8-gemm/gen/3x16c8-minmax-neon-mull-padal.c
new file mode 100644
index 0000000..658630c
--- /dev/null
+++ b/src/qs8-gemm/gen/3x16c8-minmax-neon-mull-padal.c
@@ -0,0 +1,476 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/c8-neon-mull-padal.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+
+  do {
+    int32x4_t vacc0x0 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x1 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x2 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x3 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x4 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x5 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x6 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x7 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x8 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x9 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x10 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x11 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x12 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x13 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x14 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x15 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc1x0 = vacc0x0;
+    int32x4_t vacc1x1 = vacc0x1;
+    int32x4_t vacc1x2 = vacc0x2;
+    int32x4_t vacc1x3 = vacc0x3;
+    int32x4_t vacc1x4 = vacc0x4;
+    int32x4_t vacc1x5 = vacc0x5;
+    int32x4_t vacc1x6 = vacc0x6;
+    int32x4_t vacc1x7 = vacc0x7;
+    int32x4_t vacc1x8 = vacc0x8;
+    int32x4_t vacc1x9 = vacc0x9;
+    int32x4_t vacc1x10 = vacc0x10;
+    int32x4_t vacc1x11 = vacc0x11;
+    int32x4_t vacc1x12 = vacc0x12;
+    int32x4_t vacc1x13 = vacc0x13;
+    int32x4_t vacc1x14 = vacc0x14;
+    int32x4_t vacc1x15 = vacc0x15;
+    int32x4_t vacc2x0 = vacc0x0;
+    int32x4_t vacc2x1 = vacc0x1;
+    int32x4_t vacc2x2 = vacc0x2;
+    int32x4_t vacc2x3 = vacc0x3;
+    int32x4_t vacc2x4 = vacc0x4;
+    int32x4_t vacc2x5 = vacc0x5;
+    int32x4_t vacc2x6 = vacc0x6;
+    int32x4_t vacc2x7 = vacc0x7;
+    int32x4_t vacc2x8 = vacc0x8;
+    int32x4_t vacc2x9 = vacc0x9;
+    int32x4_t vacc2x10 = vacc0x10;
+    int32x4_t vacc2x11 = vacc0x11;
+    int32x4_t vacc2x12 = vacc0x12;
+    int32x4_t vacc2x13 = vacc0x13;
+    int32x4_t vacc2x14 = vacc0x14;
+    int32x4_t vacc2x15 = vacc0x15;
+
+    size_t k = kc;
+
+    // Handle 8 bytes at a time using MUL.
+    while (k > 0) {
+      const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+      const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+      const int8x8_t va2 = vld1_s8(a2); a2 += 8;
+
+      const int8x8_t vb0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x0 = vmull_s8(vb0, va0);
+      const int16x8_t vprod1x0 = vmull_s8(vb0, va1);
+      const int16x8_t vprod2x0 = vmull_s8(vb0, va2);
+      vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+      vacc1x0 = vpadalq_s16(vacc1x0, vprod1x0);
+      vacc2x0 = vpadalq_s16(vacc2x0, vprod2x0);
+      const int8x8_t vb1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x1 = vmull_s8(vb1, va0);
+      const int16x8_t vprod1x1 = vmull_s8(vb1, va1);
+      const int16x8_t vprod2x1 = vmull_s8(vb1, va2);
+      vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+      vacc1x1 = vpadalq_s16(vacc1x1, vprod1x1);
+      vacc2x1 = vpadalq_s16(vacc2x1, vprod2x1);
+      const int8x8_t vb2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x2 = vmull_s8(vb2, va0);
+      const int16x8_t vprod1x2 = vmull_s8(vb2, va1);
+      const int16x8_t vprod2x2 = vmull_s8(vb2, va2);
+      vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+      vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);
+      vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
+      const int8x8_t vb3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x3 = vmull_s8(vb3, va0);
+      const int16x8_t vprod1x3 = vmull_s8(vb3, va1);
+      const int16x8_t vprod2x3 = vmull_s8(vb3, va2);
+      vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+      vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);
+      vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
+      const int8x8_t vb4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x4 = vmull_s8(vb4, va0);
+      const int16x8_t vprod1x4 = vmull_s8(vb4, va1);
+      const int16x8_t vprod2x4 = vmull_s8(vb4, va2);
+      vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+      vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);
+      vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);
+      const int8x8_t vb5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x5 = vmull_s8(vb5, va0);
+      const int16x8_t vprod1x5 = vmull_s8(vb5, va1);
+      const int16x8_t vprod2x5 = vmull_s8(vb5, va2);
+      vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+      vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);
+      vacc2x5 = vpadalq_s16(vacc2x5, vprod2x5);
+      const int8x8_t vb6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x6 = vmull_s8(vb6, va0);
+      const int16x8_t vprod1x6 = vmull_s8(vb6, va1);
+      const int16x8_t vprod2x6 = vmull_s8(vb6, va2);
+      vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+      vacc1x6 = vpadalq_s16(vacc1x6, vprod1x6);
+      vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6);
+      const int8x8_t vb7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x7 = vmull_s8(vb7, va0);
+      const int16x8_t vprod1x7 = vmull_s8(vb7, va1);
+      const int16x8_t vprod2x7 = vmull_s8(vb7, va2);
+      vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+      vacc1x7 = vpadalq_s16(vacc1x7, vprod1x7);
+      vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7);
+      const int8x8_t vb8 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x8 = vmull_s8(vb8, va0);
+      const int16x8_t vprod1x8 = vmull_s8(vb8, va1);
+      const int16x8_t vprod2x8 = vmull_s8(vb8, va2);
+      vacc0x8 = vpadalq_s16(vacc0x8, vprod0x8);
+      vacc1x8 = vpadalq_s16(vacc1x8, vprod1x8);
+      vacc2x8 = vpadalq_s16(vacc2x8, vprod2x8);
+      const int8x8_t vb9 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x9 = vmull_s8(vb9, va0);
+      const int16x8_t vprod1x9 = vmull_s8(vb9, va1);
+      const int16x8_t vprod2x9 = vmull_s8(vb9, va2);
+      vacc0x9 = vpadalq_s16(vacc0x9, vprod0x9);
+      vacc1x9 = vpadalq_s16(vacc1x9, vprod1x9);
+      vacc2x9 = vpadalq_s16(vacc2x9, vprod2x9);
+      const int8x8_t vb10 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x10 = vmull_s8(vb10, va0);
+      const int16x8_t vprod1x10 = vmull_s8(vb10, va1);
+      const int16x8_t vprod2x10 = vmull_s8(vb10, va2);
+      vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
+      vacc1x10 = vpadalq_s16(vacc1x10, vprod1x10);
+      vacc2x10 = vpadalq_s16(vacc2x10, vprod2x10);
+      const int8x8_t vb11 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x11 = vmull_s8(vb11, va0);
+      const int16x8_t vprod1x11 = vmull_s8(vb11, va1);
+      const int16x8_t vprod2x11 = vmull_s8(vb11, va2);
+      vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11);
+      vacc1x11 = vpadalq_s16(vacc1x11, vprod1x11);
+      vacc2x11 = vpadalq_s16(vacc2x11, vprod2x11);
+      const int8x8_t vb12 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x12 = vmull_s8(vb12, va0);
+      const int16x8_t vprod1x12 = vmull_s8(vb12, va1);
+      const int16x8_t vprod2x12 = vmull_s8(vb12, va2);
+      vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);
+      vacc1x12 = vpadalq_s16(vacc1x12, vprod1x12);
+      vacc2x12 = vpadalq_s16(vacc2x12, vprod2x12);
+      const int8x8_t vb13 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x13 = vmull_s8(vb13, va0);
+      const int16x8_t vprod1x13 = vmull_s8(vb13, va1);
+      const int16x8_t vprod2x13 = vmull_s8(vb13, va2);
+      vacc0x13 = vpadalq_s16(vacc0x13, vprod0x13);
+      vacc1x13 = vpadalq_s16(vacc1x13, vprod1x13);
+      vacc2x13 = vpadalq_s16(vacc2x13, vprod2x13);
+      const int8x8_t vb14 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x14 = vmull_s8(vb14, va0);
+      const int16x8_t vprod1x14 = vmull_s8(vb14, va1);
+      const int16x8_t vprod2x14 = vmull_s8(vb14, va2);
+      vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
+      vacc1x14 = vpadalq_s16(vacc1x14, vprod1x14);
+      vacc2x14 = vpadalq_s16(vacc2x14, vprod2x14);
+      const int8x8_t vb15 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x15 = vmull_s8(vb15, va0);
+      const int16x8_t vprod1x15 = vmull_s8(vb15, va1);
+      const int16x8_t vprod2x15 = vmull_s8(vb15, va2);
+      vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
+      vacc1x15 = vpadalq_s16(vacc1x15, vprod1x15);
+      vacc2x15 = vpadalq_s16(vacc2x15, vprod2x15);
+
+      k -= 8 * sizeof(int8_t);
+    }
+
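+    // Each vacc{M}x{N} now holds 4 partial sums for one output column; reduce them horizontally
+    // and pack 4 adjacent columns into one int32x4_t (vpaddq_s32 on AArch64, vpadd_s32/vcombine_s32 elsewhere).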
+#if XNN_ARCH_ARM64
+    const int32x4_t vsum0x01 = vpaddq_s32(vacc0x0, vacc0x1);
+    const int32x4_t vsum0x23 = vpaddq_s32(vacc0x2, vacc0x3);
+    const int32x4_t vsum0x45 = vpaddq_s32(vacc0x4, vacc0x5);
+    const int32x4_t vsum0x67 = vpaddq_s32(vacc0x6, vacc0x7);
+    const int32x4_t vsum0x89 = vpaddq_s32(vacc0x8, vacc0x9);
+    const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11);
+    const int32x4_t vsum0xCD = vpaddq_s32(vacc0x12, vacc0x13);
+    const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
+    const int32x4_t vsum1x01 = vpaddq_s32(vacc1x0, vacc1x1);
+    const int32x4_t vsum1x23 = vpaddq_s32(vacc1x2, vacc1x3);
+    const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);
+    const int32x4_t vsum1x67 = vpaddq_s32(vacc1x6, vacc1x7);
+    const int32x4_t vsum1x89 = vpaddq_s32(vacc1x8, vacc1x9);
+    const int32x4_t vsum1xAB = vpaddq_s32(vacc1x10, vacc1x11);
+    const int32x4_t vsum1xCD = vpaddq_s32(vacc1x12, vacc1x13);
+    const int32x4_t vsum1xEF = vpaddq_s32(vacc1x14, vacc1x15);
+    const int32x4_t vsum2x01 = vpaddq_s32(vacc2x0, vacc2x1);
+    const int32x4_t vsum2x23 = vpaddq_s32(vacc2x2, vacc2x3);
+    const int32x4_t vsum2x45 = vpaddq_s32(vacc2x4, vacc2x5);
+    const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7);
+    const int32x4_t vsum2x89 = vpaddq_s32(vacc2x8, vacc2x9);
+    const int32x4_t vsum2xAB = vpaddq_s32(vacc2x10, vacc2x11);
+    const int32x4_t vsum2xCD = vpaddq_s32(vacc2x12, vacc2x13);
+    const int32x4_t vsum2xEF = vpaddq_s32(vacc2x14, vacc2x15);
+    int32x4_t vacc0x0123 = vpaddq_s32(vsum0x01, vsum0x23);
+    int32x4_t vacc0x4567 = vpaddq_s32(vsum0x45, vsum0x67);
+    int32x4_t vacc0x89AB = vpaddq_s32(vsum0x89, vsum0xAB);
+    int32x4_t vacc0xCDEF = vpaddq_s32(vsum0xCD, vsum0xEF);
+    int32x4_t vacc1x0123 = vpaddq_s32(vsum1x01, vsum1x23);
+    int32x4_t vacc1x4567 = vpaddq_s32(vsum1x45, vsum1x67);
+    int32x4_t vacc1x89AB = vpaddq_s32(vsum1x89, vsum1xAB);
+    int32x4_t vacc1xCDEF = vpaddq_s32(vsum1xCD, vsum1xEF);
+    int32x4_t vacc2x0123 = vpaddq_s32(vsum2x01, vsum2x23);
+    int32x4_t vacc2x4567 = vpaddq_s32(vsum2x45, vsum2x67);
+    int32x4_t vacc2x89AB = vpaddq_s32(vsum2x89, vsum2xAB);
+    int32x4_t vacc2xCDEF = vpaddq_s32(vsum2xCD, vsum2xEF);
+#else
+    const int32x2_t vpsum0x0 = vadd_s32(vget_low_s32(vacc0x0), vget_high_s32(vacc0x0));
+    const int32x2_t vpsum0x1 = vadd_s32(vget_low_s32(vacc0x1), vget_high_s32(vacc0x1));
+    const int32x2_t vpsum0x2 = vadd_s32(vget_low_s32(vacc0x2), vget_high_s32(vacc0x2));
+    const int32x2_t vpsum0x3 = vadd_s32(vget_low_s32(vacc0x3), vget_high_s32(vacc0x3));
+    const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1);
+    const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3);
+    int32x4_t vacc0x0123 = vcombine_s32(vsum0x01, vsum0x23);
+    const int32x2_t vpsum0x4 = vadd_s32(vget_low_s32(vacc0x4), vget_high_s32(vacc0x4));
+    const int32x2_t vpsum0x5 = vadd_s32(vget_low_s32(vacc0x5), vget_high_s32(vacc0x5));
+    const int32x2_t vpsum0x6 = vadd_s32(vget_low_s32(vacc0x6), vget_high_s32(vacc0x6));
+    const int32x2_t vpsum0x7 = vadd_s32(vget_low_s32(vacc0x7), vget_high_s32(vacc0x7));
+    const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5);
+    const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7);
+    int32x4_t vacc0x4567 = vcombine_s32(vsum0x45, vsum0x67);
+    const int32x2_t vpsum0x8 = vadd_s32(vget_low_s32(vacc0x8), vget_high_s32(vacc0x8));
+    const int32x2_t vpsum0x9 = vadd_s32(vget_low_s32(vacc0x9), vget_high_s32(vacc0x9));
+    const int32x2_t vpsum0xA = vadd_s32(vget_low_s32(vacc0x10), vget_high_s32(vacc0x10));
+    const int32x2_t vpsum0xB = vadd_s32(vget_low_s32(vacc0x11), vget_high_s32(vacc0x11));
+    const int32x2_t vsum0x89 = vpadd_s32(vpsum0x8, vpsum0x9);
+    const int32x2_t vsum0xAB = vpadd_s32(vpsum0xA, vpsum0xB);
+    int32x4_t vacc0x89AB = vcombine_s32(vsum0x89, vsum0xAB);
+    const int32x2_t vpsum0xC = vadd_s32(vget_low_s32(vacc0x12), vget_high_s32(vacc0x12));
+    const int32x2_t vpsum0xD = vadd_s32(vget_low_s32(vacc0x13), vget_high_s32(vacc0x13));
+    const int32x2_t vpsum0xE = vadd_s32(vget_low_s32(vacc0x14), vget_high_s32(vacc0x14));
+    const int32x2_t vpsum0xF = vadd_s32(vget_low_s32(vacc0x15), vget_high_s32(vacc0x15));
+    const int32x2_t vsum0xCD = vpadd_s32(vpsum0xC, vpsum0xD);
+    const int32x2_t vsum0xEF = vpadd_s32(vpsum0xE, vpsum0xF);
+    int32x4_t vacc0xCDEF = vcombine_s32(vsum0xCD, vsum0xEF);
+    const int32x2_t vpsum1x0 = vadd_s32(vget_low_s32(vacc1x0), vget_high_s32(vacc1x0));
+    const int32x2_t vpsum1x1 = vadd_s32(vget_low_s32(vacc1x1), vget_high_s32(vacc1x1));
+    const int32x2_t vpsum1x2 = vadd_s32(vget_low_s32(vacc1x2), vget_high_s32(vacc1x2));
+    const int32x2_t vpsum1x3 = vadd_s32(vget_low_s32(vacc1x3), vget_high_s32(vacc1x3));
+    const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1);
+    const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3);
+    int32x4_t vacc1x0123 = vcombine_s32(vsum1x01, vsum1x23);
+    const int32x2_t vpsum1x4 = vadd_s32(vget_low_s32(vacc1x4), vget_high_s32(vacc1x4));
+    const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));
+    const int32x2_t vpsum1x6 = vadd_s32(vget_low_s32(vacc1x6), vget_high_s32(vacc1x6));
+    const int32x2_t vpsum1x7 = vadd_s32(vget_low_s32(vacc1x7), vget_high_s32(vacc1x7));
+    const int32x2_t vsum1x45 = vpadd_s32(vpsum1x4, vpsum1x5);
+    const int32x2_t vsum1x67 = vpadd_s32(vpsum1x6, vpsum1x7);
+    int32x4_t vacc1x4567 = vcombine_s32(vsum1x45, vsum1x67);
+    const int32x2_t vpsum1x8 = vadd_s32(vget_low_s32(vacc1x8), vget_high_s32(vacc1x8));
+    const int32x2_t vpsum1x9 = vadd_s32(vget_low_s32(vacc1x9), vget_high_s32(vacc1x9));
+    const int32x2_t vpsum1xA = vadd_s32(vget_low_s32(vacc1x10), vget_high_s32(vacc1x10));
+    const int32x2_t vpsum1xB = vadd_s32(vget_low_s32(vacc1x11), vget_high_s32(vacc1x11));
+    const int32x2_t vsum1x89 = vpadd_s32(vpsum1x8, vpsum1x9);
+    const int32x2_t vsum1xAB = vpadd_s32(vpsum1xA, vpsum1xB);
+    int32x4_t vacc1x89AB = vcombine_s32(vsum1x89, vsum1xAB);
+    const int32x2_t vpsum1xC = vadd_s32(vget_low_s32(vacc1x12), vget_high_s32(vacc1x12));
+    const int32x2_t vpsum1xD = vadd_s32(vget_low_s32(vacc1x13), vget_high_s32(vacc1x13));
+    const int32x2_t vpsum1xE = vadd_s32(vget_low_s32(vacc1x14), vget_high_s32(vacc1x14));
+    const int32x2_t vpsum1xF = vadd_s32(vget_low_s32(vacc1x15), vget_high_s32(vacc1x15));
+    const int32x2_t vsum1xCD = vpadd_s32(vpsum1xC, vpsum1xD);
+    const int32x2_t vsum1xEF = vpadd_s32(vpsum1xE, vpsum1xF);
+    int32x4_t vacc1xCDEF = vcombine_s32(vsum1xCD, vsum1xEF);
+    const int32x2_t vpsum2x0 = vadd_s32(vget_low_s32(vacc2x0), vget_high_s32(vacc2x0));
+    const int32x2_t vpsum2x1 = vadd_s32(vget_low_s32(vacc2x1), vget_high_s32(vacc2x1));
+    const int32x2_t vpsum2x2 = vadd_s32(vget_low_s32(vacc2x2), vget_high_s32(vacc2x2));
+    const int32x2_t vpsum2x3 = vadd_s32(vget_low_s32(vacc2x3), vget_high_s32(vacc2x3));
+    const int32x2_t vsum2x01 = vpadd_s32(vpsum2x0, vpsum2x1);
+    const int32x2_t vsum2x23 = vpadd_s32(vpsum2x2, vpsum2x3);
+    int32x4_t vacc2x0123 = vcombine_s32(vsum2x01, vsum2x23);
+    const int32x2_t vpsum2x4 = vadd_s32(vget_low_s32(vacc2x4), vget_high_s32(vacc2x4));
+    const int32x2_t vpsum2x5 = vadd_s32(vget_low_s32(vacc2x5), vget_high_s32(vacc2x5));
+    const int32x2_t vpsum2x6 = vadd_s32(vget_low_s32(vacc2x6), vget_high_s32(vacc2x6));
+    const int32x2_t vpsum2x7 = vadd_s32(vget_low_s32(vacc2x7), vget_high_s32(vacc2x7));
+    const int32x2_t vsum2x45 = vpadd_s32(vpsum2x4, vpsum2x5);
+    const int32x2_t vsum2x67 = vpadd_s32(vpsum2x6, vpsum2x7);
+    int32x4_t vacc2x4567 = vcombine_s32(vsum2x45, vsum2x67);
+    const int32x2_t vpsum2x8 = vadd_s32(vget_low_s32(vacc2x8), vget_high_s32(vacc2x8));
+    const int32x2_t vpsum2x9 = vadd_s32(vget_low_s32(vacc2x9), vget_high_s32(vacc2x9));
+    const int32x2_t vpsum2xA = vadd_s32(vget_low_s32(vacc2x10), vget_high_s32(vacc2x10));
+    const int32x2_t vpsum2xB = vadd_s32(vget_low_s32(vacc2x11), vget_high_s32(vacc2x11));
+    const int32x2_t vsum2x89 = vpadd_s32(vpsum2x8, vpsum2x9);
+    const int32x2_t vsum2xAB = vpadd_s32(vpsum2xA, vpsum2xB);
+    int32x4_t vacc2x89AB = vcombine_s32(vsum2x89, vsum2xAB);
+    const int32x2_t vpsum2xC = vadd_s32(vget_low_s32(vacc2x12), vget_high_s32(vacc2x12));
+    const int32x2_t vpsum2xD = vadd_s32(vget_low_s32(vacc2x13), vget_high_s32(vacc2x13));
+    const int32x2_t vpsum2xE = vadd_s32(vget_low_s32(vacc2x14), vget_high_s32(vacc2x14));
+    const int32x2_t vpsum2xF = vadd_s32(vget_low_s32(vacc2x15), vget_high_s32(vacc2x15));
+    const int32x2_t vsum2xCD = vpadd_s32(vpsum2xC, vpsum2xD);
+    const int32x2_t vsum2xEF = vpadd_s32(vpsum2xE, vpsum2xF);
+    int32x4_t vacc2xCDEF = vcombine_s32(vsum2xCD, vsum2xEF);
+#endif
+
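+    // Requantize: Q31 fixed-point multiply (vqrdmulhq_s32), adjust negative values, then apply a rounding right shift (vrshlq_s32 with a negative shift count).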
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+    vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc1x89AB = vqrdmulhq_s32(vacc1x89AB, vmultiplier);
+    vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier);
+    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+    vacc2x89AB = vqrdmulhq_s32(vacc2x89AB, vmultiplier);
+    vacc2xCDEF = vqrdmulhq_s32(vacc2xCDEF, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+    vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc1x89AB = vsraq_n_s32(vacc1x89AB, vbicq_s32(vacc1x89AB, vzero_shift_mask), 31);
+    vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31);
+    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+    vacc2x89AB = vsraq_n_s32(vacc2x89AB, vbicq_s32(vacc2x89AB, vzero_shift_mask), 31);
+    vacc2xCDEF = vsraq_n_s32(vacc2xCDEF, vbicq_s32(vacc2xCDEF, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc1x89AB = vrshlq_s32(vacc1x89AB, vright_shift);
+    vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_shift);
+    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+    vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_shift);
+    vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_shift);
+
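+    // Saturating-narrow to int16, add the output zero point, then saturating-narrow to int8.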
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+    const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), voutput_zero_point);
+    int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+    int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
+    int8x16_t vout2x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc2x89ABCDEF);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+    const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+    int8x16_t vout1x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc1x01234567), vqmovn_s16(vacc1x89ABCDEF));
+    int8x16_t vout2x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc2x89ABCDEF));
+#endif
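+    // Clamp the int8 outputs to the [output_min, output_max] range from params.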
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+    vout1x0123456789ABCDEF = vmaxq_s8(vout1x0123456789ABCDEF, voutput_min);
+    vout2x0123456789ABCDEF = vmaxq_s8(vout2x0123456789ABCDEF, voutput_min);
+
+    vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+    vout1x0123456789ABCDEF = vminq_s8(vout1x0123456789ABCDEF, voutput_max);
+    vout2x0123456789ABCDEF = vminq_s8(vout2x0123456789ABCDEF, voutput_max);
+
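+    // Full 16-column tile: store whole vectors and advance; otherwise the tail below writes the remaining 8/4/2/1 columns.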
+    if (nc >= 16) {
+      vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+      vst1q_s8(c1 + 0, vout1x0123456789ABCDEF);
+      vst1q_s8(c2 + 0, vout2x0123456789ABCDEF);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
+      nc -= 16;
+    } else {
+      int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8(vout1x0123456789ABCDEF));
+      int8x8_t vout2x01234567 = vget_low_s8(vout2x0123456789ABCDEF);
+      if (nc & 8) {
+        vst1_s8(c0, vget_low_s8(vout0x01234567_1x01234567)); c0 += 8;
+        vst1_s8(c1, vget_high_s8(vout0x01234567_1x01234567)); c1 += 8;
+        vst1_s8(c2, vout2x01234567); c2 += 8;
+        vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0123456789ABCDEF));
+        vout2x01234567 = vget_high_s8(vout2x0123456789ABCDEF);
+      }
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1_lane_s8(c2, vout2x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-sse2-ld128.c b/src/qs8-gemm/gen/3x4c8-minmax-sse2-ld128.c
index 709ef30..3e40c43 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-sse2-ld128.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-sse2-ld128.c
@@ -12,6 +12,7 @@
 #include <emmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_3x4c8__sse2_ld128(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
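+  // Rounding kc up to the kernel's 8-byte K step keeps it consistent with the a-pointer rewind (a0/a1/a2 - kc) below.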
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -208,14 +210,14 @@
       vout = _mm_srli_si128(vout, 4);
       *((uint32_t*) c2) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-      a2 = (const int8_t*) ((uintptr_t) a2 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-sse2-ld64.c b/src/qs8-gemm/gen/3x4c8-minmax-sse2-ld64.c
index c01210e..ec8e708 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-sse2-ld64.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-sse2-ld64.c
@@ -12,6 +12,7 @@
 #include <emmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_3x4c8__sse2_ld64(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -210,14 +212,14 @@
       vout = _mm_srli_si128(vout, 4);
       *((uint32_t*) c2) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-      a2 = (const int8_t*) ((uintptr_t) a2 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-sse41-ld128.c b/src/qs8-gemm/gen/3x4c8-minmax-sse41-ld128.c
index fe38e93..eefed28 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-sse41-ld128.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-sse41-ld128.c
@@ -12,6 +12,7 @@
 #include <smmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -174,14 +176,14 @@
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
       *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-      a2 = (const int8_t*) ((uintptr_t) a2 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-sse41-ld64.c b/src/qs8-gemm/gen/3x4c8-minmax-sse41-ld64.c
index f511ba1..1fac46b 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-sse41-ld64.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-sse41-ld64.c
@@ -12,6 +12,7 @@
 #include <smmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -176,14 +178,14 @@
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
       *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-      a2 = (const int8_t*) ((uintptr_t) a2 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-ssse3-ld128.c b/src/qs8-gemm/gen/3x4c8-minmax-ssse3-ld128.c
index 166acff..d5dedb7 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-ssse3-ld128.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-ssse3-ld128.c
@@ -12,6 +12,7 @@
 #include <tmmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_3x4c8__ssse3_ld128(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -208,14 +210,14 @@
       vout = _mm_srli_si128(vout, 4);
       *((uint32_t*) c2) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-      a2 = (const int8_t*) ((uintptr_t) a2 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-ssse3-ld64.c b/src/qs8-gemm/gen/3x4c8-minmax-ssse3-ld64.c
index 255c8e2..661e5ec 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-ssse3-ld64.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-ssse3-ld64.c
@@ -12,6 +12,7 @@
 #include <tmmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_3x4c8__ssse3_ld64(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -210,14 +212,14 @@
       vout = _mm_srli_si128(vout, 4);
       *((uint32_t*) c2) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-      a2 = (const int8_t*) ((uintptr_t) a2 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-wasmsimd-ld128.c b/src/qs8-gemm/gen/3x4c8-minmax-wasmsimd-ld128.c
index 8a037ed..138f883 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-wasmsimd-ld128.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-wasmsimd-ld128.c
@@ -12,6 +12,7 @@
 #include <wasm_simd128.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_3x4c8__wasmsimd_ld128(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -192,14 +194,14 @@
       *((float*) c1) = (float) wasm_f32x4_extract_lane(vout, 1);
       *((float*) c2) = (float) wasm_f32x4_extract_lane(vout, 2);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-      a2 = (const int8_t*) ((uintptr_t) a2 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-wasmsimd-ld64.c b/src/qs8-gemm/gen/3x4c8-minmax-wasmsimd-ld64.c
index f63c0f0..63303a9 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-wasmsimd-ld64.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-wasmsimd-ld64.c
@@ -12,6 +12,7 @@
 #include <wasm_simd128.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_3x4c8__wasmsimd_ld64(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -188,14 +190,14 @@
       *((float*) c1) = (float) wasm_f32x4_extract_lane(vout, 1);
       *((float*) c2) = (float) wasm_f32x4_extract_lane(vout, 2);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-      a2 = (const int8_t*) ((uintptr_t) a2 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-xop-ld128.c b/src/qs8-gemm/gen/3x4c8-minmax-xop-ld128.c
index f3beb05..df0e5d5 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-xop-ld128.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-xop-ld128.c
@@ -17,6 +17,7 @@
 #endif
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128(
@@ -40,6 +41,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -179,14 +181,14 @@
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
       *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-      a2 = (const int8_t*) ((uintptr_t) a2 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-xop-ld64.c b/src/qs8-gemm/gen/3x4c8-minmax-xop-ld64.c
index f82a244..090bd73 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-xop-ld64.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-xop-ld64.c
@@ -17,6 +17,7 @@
 #endif
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64(
@@ -40,6 +41,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -181,14 +183,14 @@
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
       *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-      a2 = (const int8_t*) ((uintptr_t) a2 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/3x4c8-xw-minmax-sse2.c b/src/qs8-gemm/gen/3x4c8-xw-minmax-sse2.c
index 1da482f..eb77d6e 100644
--- a/src/qs8-gemm/gen/3x4c8-xw-minmax-sse2.c
+++ b/src/qs8-gemm/gen/3x4c8-xw-minmax-sse2.c
@@ -12,6 +12,7 @@
 #include <emmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse2(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -206,14 +208,14 @@
       vout = _mm_srli_si128(vout, 4);
       *((uint32_t*) c2) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-      a2 = (const int8_t*) ((uintptr_t) a2 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/3x4c8-xw-minmax-sse41.c b/src/qs8-gemm/gen/3x4c8-xw-minmax-sse41.c
index 27ec277..167ab31 100644
--- a/src/qs8-gemm/gen/3x4c8-xw-minmax-sse41.c
+++ b/src/qs8-gemm/gen/3x4c8-xw-minmax-sse41.c
@@ -12,6 +12,7 @@
 #include <smmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -172,14 +174,14 @@
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
       *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-      a2 = (const int8_t*) ((uintptr_t) a2 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/3x4c8-xw-minmax-ssse3.c b/src/qs8-gemm/gen/3x4c8-xw-minmax-ssse3.c
index 93fe44b..d299855 100644
--- a/src/qs8-gemm/gen/3x4c8-xw-minmax-ssse3.c
+++ b/src/qs8-gemm/gen/3x4c8-xw-minmax-ssse3.c
@@ -12,6 +12,7 @@
 #include <tmmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__ssse3(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -206,14 +208,14 @@
       vout = _mm_srli_si128(vout, 4);
       *((uint32_t*) c2) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-      a2 = (const int8_t*) ((uintptr_t) a2 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/3x4c8-xw-minmax-wasmsimd.c b/src/qs8-gemm/gen/3x4c8-xw-minmax-wasmsimd.c
index 4258337..7cc2d01 100644
--- a/src/qs8-gemm/gen/3x4c8-xw-minmax-wasmsimd.c
+++ b/src/qs8-gemm/gen/3x4c8-xw-minmax-wasmsimd.c
@@ -12,6 +12,7 @@
 #include <wasm_simd128.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__wasmsimd(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -188,14 +190,14 @@
       *((float*) c1) = (float) wasm_f32x4_extract_lane(vout, 1);
       *((float*) c2) = (float) wasm_f32x4_extract_lane(vout, 2);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-      a2 = (const int8_t*) ((uintptr_t) a2 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/3x4c8-xw-minmax-xop.c b/src/qs8-gemm/gen/3x4c8-xw-minmax-xop.c
index 0f8932b..79cdbf1 100644
--- a/src/qs8-gemm/gen/3x4c8-xw-minmax-xop.c
+++ b/src/qs8-gemm/gen/3x4c8-xw-minmax-xop.c
@@ -17,6 +17,7 @@
 #endif
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop(
@@ -40,6 +41,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -177,14 +179,14 @@
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
       *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-      a2 = (const int8_t*) ((uintptr_t) a2 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/3x8-minmax-neon-mlal-lane.c b/src/qs8-gemm/gen/3x8-minmax-neon-mlal-lane.c
new file mode 100644
index 0000000..224000d
--- /dev/null
+++ b/src/qs8-gemm/gen/3x8-minmax-neon-mlal-lane.c
@@ -0,0 +1,325 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/neon-mlal-lane.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/gemm.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+
+  do {
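+    // Load the per-output-channel bias into row 0's accumulators; rows 1 and 2 reuse the same values.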
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc1x0123 = vacc0x0123;
+    int32x4_t vacc1x4567 = vacc0x4567;
+    int32x4_t vacc2x0123 = vacc0x0123;
+    int32x4_t vacc2x4567 = vacc0x4567;
+
+    size_t k = kc;
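+    // Main loop: consume 8 K values per iteration, widening int8 to int16 and accumulating with vmlal_lane_s16.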
+    while (k >= 8 * sizeof(int8_t)) {
+      const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+      const int16x8_t vxa0 = vmovl_s8(va0);
+      const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+      const int16x8_t vxa1 = vmovl_s8(va1);
+      const int8x8_t va2 = vld1_s8(a2); a2 += 8;
+      const int16x8_t vxa2 = vmovl_s8(va2);
+
+      const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vxb01234567c0 = vmovl_s8(vb01234567c0);
+
+      vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+      vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+      vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
+      vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
+      vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
+      vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
+      const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vxb01234567c1 = vmovl_s8(vb01234567c1);
+
+      vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+      vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+      vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
+      vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
+      vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa2), 1);
+      vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa2), 1);
+      const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vxb01234567c2 = vmovl_s8(vb01234567c2);
+
+      vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+      vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+      vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
+      vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
+      vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa2), 2);
+      vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa2), 2);
+      const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vxb01234567c3 = vmovl_s8(vb01234567c3);
+
+      vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+      vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+      vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
+      vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
+      vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa2), 3);
+      vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa2), 3);
+
+      const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vxb01234567c4 = vmovl_s8(vb01234567c4);
+
+      vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+      vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+      vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
+      vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
+      vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa2), 0);
+      vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa2), 0);
+      const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vxb01234567c5 = vmovl_s8(vb01234567c5);
+
+      vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+      vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+      vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
+      vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
+      vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa2), 1);
+      vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa2), 1);
+      const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vxb01234567c6 = vmovl_s8(vb01234567c6);
+
+      vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+      vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+      vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
+      vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
+      vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa2), 2);
+      vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa2), 2);
+      const int8x8_t vb01234567c7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vxb01234567c7 = vmovl_s8(vb01234567c7);
+
+      vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa0), 3);
+      vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa0), 3);
+      vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa1), 3);
+      vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa1), 3);
+      vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa2), 3);
+      vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa2), 3);
+
+      k -= 8 * sizeof(int8_t);
+    }
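+    // Remainder: handle the final 1-7 K values with the same widening multiply-accumulate pattern.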
+    if XNN_UNLIKELY(k != 0) {
+      const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+      const int16x8_t vxa0 = vmovl_s8(va0);
+      const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
+      const int16x8_t vxa1 = vmovl_s8(va1);
+      const int8x8_t va2 = vld1_s8(a2); a2 = (const int8_t*) ((uintptr_t) a2 + k);
+      const int16x8_t vxa2 = vmovl_s8(va2);
+
+      const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vxb01234567c0 = vmovl_s8(vb01234567c0);
+
+      vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+      vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+      vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
+      vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
+      vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
+      vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
+
+      if (k >= 2 * sizeof(int8_t)) {
+        const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb01234567c1 = vmovl_s8(vb01234567c1);
+
+        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+        vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
+        vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
+        vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa2), 1);
+        vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa2), 1);
+
+        if (k > 2 * sizeof(int8_t)) {
+          const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int16x8_t vxb01234567c2 = vmovl_s8(vb01234567c2);
+
+          vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+          vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+          vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
+          vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
+          vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa2), 2);
+          vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa2), 2);
+
+          if (k >= 4 * sizeof(int8_t)) {
+            const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int16x8_t vxb01234567c3 = vmovl_s8(vb01234567c3);
+
+            vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+            vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+            vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
+            vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
+            vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa2), 3);
+            vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa2), 3);
+
+            if (k > 4 * sizeof(int8_t)) {
+              const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+              const int16x8_t vxb01234567c4 = vmovl_s8(vb01234567c4);
+
+              vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+              vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+              vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
+              vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
+              vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa2), 0);
+              vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa2), 0);
+
+              if (k >= 6 * sizeof(int8_t)) {
+                const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+                const int16x8_t vxb01234567c5 = vmovl_s8(vb01234567c5);
+
+                vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+                vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+                vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
+                vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
+                vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa2), 1);
+                vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa2), 1);
+
+                if (k > 6 * sizeof(int8_t)) {
+                  const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+                  const int16x8_t vxb01234567c6 = vmovl_s8(vb01234567c6);
+
+                  vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+                  vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+                  vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
+                  vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
+                  vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa2), 2);
+                  vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa2), 2);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+
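+    // Requantize: fixed-point multiply, negative-value adjustment, then a rounding right shift.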
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+
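+    // Output conversion: saturating-narrow the int32 accumulators to int16, add the
+    // output zero point, saturating-narrow again to int8, then clamp to
+    // [output_min, output_max].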
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
+    int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
+    int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min);
+    vout2x01234567 = vmax_s8(vout2x01234567, vget_low_s8(voutput_min));
+
+    vout0x01234567_1x01234567 = vminq_s8(vout0x01234567_1x01234567, voutput_max);
+    vout2x01234567 = vmin_s8(vout2x01234567, vget_low_s8(voutput_max));
+
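+    // Store: with at least 8 output channels left, write full 8-byte rows and advance;
+    // otherwise write the 4/2/1-element tail with lane stores and finish this tile.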
+    if (nc >= 8) {
+      vst1_s8(c0 + 0, vget_low_s8(vout0x01234567_1x01234567));
+      vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567));
+      vst1_s8(c2 + 0, vout2x01234567);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1_lane_s8(c2, vout2x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-gemm/gen/3x8-minmax-neon-mull-addw-dup.c b/src/qs8-gemm/gen/3x8-minmax-neon-mull-addw-dup.c
new file mode 100644
index 0000000..b66556d
--- /dev/null
+++ b/src/qs8-gemm/gen/3x8-minmax-neon-mull-addw-dup.c
@@ -0,0 +1,347 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/neon-mull-addw-dup.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/gemm.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_3x8__neon_mull_addw_dup(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+
+  do {
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc1x0123 = vacc0x0123;
+    int32x4_t vacc1x4567 = vacc0x4567;
+    int32x4_t vacc2x0123 = vacc0x0123;
+    int32x4_t vacc2x4567 = vacc0x4567;
+
+    size_t k = kc;
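+    // Main loop: 8 bytes of K per row per iteration. Each A byte is broadcast with
+    // vdup_lane_s8, multiplied against one 8-wide row of B with vmull_s8 (widening to
+    // int16), and added into the int32 accumulators with vaddw_s16.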
+    while (k >= 8 * sizeof(int8_t)) {
+      const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+      const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+      const int8x8_t va2 = vld1_s8(a2); a2 += 8;
+
+      const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va0, 0));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c0));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c0));
+      const int16x8_t vprod1x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va1, 0));
+      vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c0));
+      vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c0));
+      const int16x8_t vprod2x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va2, 0));
+      vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c0));
+      vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c0));
+      const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va0, 1));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c1));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c1));
+      const int16x8_t vprod1x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va1, 1));
+      vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c1));
+      vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c1));
+      const int16x8_t vprod2x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va2, 1));
+      vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c1));
+      vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c1));
+      const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va0, 2));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c2));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c2));
+      const int16x8_t vprod1x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va1, 2));
+      vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c2));
+      vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c2));
+      const int16x8_t vprod2x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va2, 2));
+      vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c2));
+      vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c2));
+      const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));
+      const int16x8_t vprod1x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va1, 3));
+      vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c3));
+      vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c3));
+      const int16x8_t vprod2x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va2, 3));
+      vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c3));
+      vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c3));
+      const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va0, 4));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c4));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c4));
+      const int16x8_t vprod1x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va1, 4));
+      vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c4));
+      vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c4));
+      const int16x8_t vprod2x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va2, 4));
+      vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c4));
+      vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c4));
+      const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va0, 5));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c5));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c5));
+      const int16x8_t vprod1x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va1, 5));
+      vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c5));
+      vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c5));
+      const int16x8_t vprod2x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va2, 5));
+      vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c5));
+      vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c5));
+      const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va0, 6));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c6));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c6));
+      const int16x8_t vprod1x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va1, 6));
+      vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c6));
+      vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c6));
+      const int16x8_t vprod2x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va2, 6));
+      vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c6));
+      vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c6));
+      const int8x8_t vb01234567c7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c7 = vmull_s8(vb01234567c7, vdup_lane_s8(va0, 7));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c7));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c7));
+      const int16x8_t vprod1x01234567c7 = vmull_s8(vb01234567c7, vdup_lane_s8(va1, 7));
+      vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c7));
+      vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c7));
+      const int16x8_t vprod2x01234567c7 = vmull_s8(vb01234567c7, vdup_lane_s8(va2, 7));
+      vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c7));
+      vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c7));
+
+      k -= 8 * sizeof(int8_t);
+    }
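+    // Remainder: up to 7 leftover K bytes; each nested branch consumes one more 8-wide row of B.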
+    if XNN_UNLIKELY(k != 0) {
+      const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+      const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
+      const int8x8_t va2 = vld1_s8(a2); a2 = (const int8_t*) ((uintptr_t) a2 + k);
+
+      const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va0, 0));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c0));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c0));
+      const int16x8_t vprod1x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va1, 0));
+      vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c0));
+      vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c0));
+      const int16x8_t vprod2x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va2, 0));
+      vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c0));
+      vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c0));
+
+      if (k >= 2 * sizeof(int8_t)) {
+        const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va0, 1));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c1));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c1));
+        const int16x8_t vprod1x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va1, 1));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c1));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c1));
+        const int16x8_t vprod2x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va2, 1));
+        vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c1));
+        vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c1));
+
+        if (k > 2 * sizeof(int8_t)) {
+          const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          const int16x8_t vprod0x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va0, 2));
+          vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c2));
+          vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c2));
+          const int16x8_t vprod1x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va1, 2));
+          vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c2));
+          vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c2));
+          const int16x8_t vprod2x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va2, 2));
+          vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c2));
+          vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c2));
+
+          if (k >= 4 * sizeof(int8_t)) {
+            const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+            const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));
+            vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
+            vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));
+            const int16x8_t vprod1x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va1, 3));
+            vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c3));
+            vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c3));
+            const int16x8_t vprod2x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va2, 3));
+            vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c3));
+            vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c3));
+
+            if (k > 4 * sizeof(int8_t)) {
+              const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+              const int16x8_t vprod0x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va0, 4));
+              vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c4));
+              vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c4));
+              const int16x8_t vprod1x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va1, 4));
+              vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c4));
+              vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c4));
+              const int16x8_t vprod2x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va2, 4));
+              vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c4));
+              vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c4));
+
+              if (k >= 6 * sizeof(int8_t)) {
+                const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+                const int16x8_t vprod0x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va0, 5));
+                vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c5));
+                vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c5));
+                const int16x8_t vprod1x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va1, 5));
+                vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c5));
+                vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c5));
+                const int16x8_t vprod2x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va2, 5));
+                vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c5));
+                vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c5));
+
+                if (k > 6 * sizeof(int8_t)) {
+                  const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+                  const int16x8_t vprod0x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va0, 6));
+                  vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c6));
+                  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c6));
+                  const int16x8_t vprod1x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va1, 6));
+                  vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c6));
+                  vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c6));
+                  const int16x8_t vprod2x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va2, 6));
+                  vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c6));
+                  vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c6));
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
+    int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
+    int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min);
+    vout2x01234567 = vmax_s8(vout2x01234567, vget_low_s8(voutput_min));
+
+    vout0x01234567_1x01234567 = vminq_s8(vout0x01234567_1x01234567, voutput_max);
+    vout2x01234567 = vmin_s8(vout2x01234567, vget_low_s8(voutput_max));
+
+    if (nc >= 8) {
+      vst1_s8(c0 + 0, vget_low_s8(vout0x01234567_1x01234567));
+      vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567));
+      vst1_s8(c2 + 0, vout2x01234567);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1_lane_s8(c2, vout2x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-gemm/gen/3x8c16-minmax-neon-mlal-padal.c b/src/qs8-gemm/gen/3x8c16-minmax-neon-mlal-padal.c
new file mode 100644
index 0000000..0b18356
--- /dev/null
+++ b/src/qs8-gemm/gen/3x8c16-minmax-neon-mlal-padal.c
@@ -0,0 +1,323 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/c16-neon-mlal-padal.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 16);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+
+  do {
+    int32x4_t vacc0x0 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x1 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x2 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x3 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x4 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x5 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x6 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x7 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc1x0 = vacc0x0;
+    int32x4_t vacc1x1 = vacc0x1;
+    int32x4_t vacc1x2 = vacc0x2;
+    int32x4_t vacc1x3 = vacc0x3;
+    int32x4_t vacc1x4 = vacc0x4;
+    int32x4_t vacc1x5 = vacc0x5;
+    int32x4_t vacc1x6 = vacc0x6;
+    int32x4_t vacc1x7 = vacc0x7;
+    int32x4_t vacc2x0 = vacc0x0;
+    int32x4_t vacc2x1 = vacc0x1;
+    int32x4_t vacc2x2 = vacc0x2;
+    int32x4_t vacc2x3 = vacc0x3;
+    int32x4_t vacc2x4 = vacc0x4;
+    int32x4_t vacc2x5 = vacc0x5;
+    int32x4_t vacc2x6 = vacc0x6;
+    int32x4_t vacc2x7 = vacc0x7;
+
+    // KC loop of 16
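+    // Each 16-byte slice of B holds 16 K values for one output channel; products are
+    // formed with vmull_s8/vmlal_s8 and pairwise-accumulated into that channel's int32
+    // accumulator with vpadalq_s16, to be reduced across lanes after the loop.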
+    size_t k = 0;
+    while (k < kc) {
+      const int8x16_t va0 = vld1q_s8(a0); a0 += 16;
+      const int8x16_t va1 = vld1q_s8(a1); a1 += 16;
+      const int8x16_t va2 = vld1q_s8(a2); a2 += 16;
+
+      const int8x16_t vb0 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb1 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb2 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb3 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb4 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb5 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb6 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb7 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+
+      int16x8_t vprod0x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va0));
+      int16x8_t vprod1x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va1));
+      int16x8_t vprod2x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va2));
+      vprod0x0 = vmlal_s8(vprod0x0, vget_high_s8(vb0), vget_high_s8(va0));
+      vprod1x0 = vmlal_s8(vprod1x0, vget_high_s8(vb0), vget_high_s8(va1));
+      vprod2x0 = vmlal_s8(vprod2x0, vget_high_s8(vb0), vget_high_s8(va2));
+      vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+      vacc1x0 = vpadalq_s16(vacc1x0, vprod1x0);
+      vacc2x0 = vpadalq_s16(vacc2x0, vprod2x0);
+      int16x8_t vprod0x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va0));
+      int16x8_t vprod1x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va1));
+      int16x8_t vprod2x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va2));
+      vprod0x1 = vmlal_s8(vprod0x1, vget_high_s8(vb1), vget_high_s8(va0));
+      vprod1x1 = vmlal_s8(vprod1x1, vget_high_s8(vb1), vget_high_s8(va1));
+      vprod2x1 = vmlal_s8(vprod2x1, vget_high_s8(vb1), vget_high_s8(va2));
+      vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+      vacc1x1 = vpadalq_s16(vacc1x1, vprod1x1);
+      vacc2x1 = vpadalq_s16(vacc2x1, vprod2x1);
+      int16x8_t vprod0x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va0));
+      int16x8_t vprod1x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va1));
+      int16x8_t vprod2x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va2));
+      vprod0x2 = vmlal_s8(vprod0x2, vget_high_s8(vb2), vget_high_s8(va0));
+      vprod1x2 = vmlal_s8(vprod1x2, vget_high_s8(vb2), vget_high_s8(va1));
+      vprod2x2 = vmlal_s8(vprod2x2, vget_high_s8(vb2), vget_high_s8(va2));
+      vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+      vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);
+      vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
+      int16x8_t vprod0x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va0));
+      int16x8_t vprod1x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va1));
+      int16x8_t vprod2x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va2));
+      vprod0x3 = vmlal_s8(vprod0x3, vget_high_s8(vb3), vget_high_s8(va0));
+      vprod1x3 = vmlal_s8(vprod1x3, vget_high_s8(vb3), vget_high_s8(va1));
+      vprod2x3 = vmlal_s8(vprod2x3, vget_high_s8(vb3), vget_high_s8(va2));
+      vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+      vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);
+      vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
+      int16x8_t vprod0x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va0));
+      int16x8_t vprod1x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va1));
+      int16x8_t vprod2x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va2));
+      vprod0x4 = vmlal_s8(vprod0x4, vget_high_s8(vb4), vget_high_s8(va0));
+      vprod1x4 = vmlal_s8(vprod1x4, vget_high_s8(vb4), vget_high_s8(va1));
+      vprod2x4 = vmlal_s8(vprod2x4, vget_high_s8(vb4), vget_high_s8(va2));
+      vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+      vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);
+      vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);
+      int16x8_t vprod0x5 = vmull_s8(vget_low_s8(vb5), vget_low_s8(va0));
+      int16x8_t vprod1x5 = vmull_s8(vget_low_s8(vb5), vget_low_s8(va1));
+      int16x8_t vprod2x5 = vmull_s8(vget_low_s8(vb5), vget_low_s8(va2));
+      vprod0x5 = vmlal_s8(vprod0x5, vget_high_s8(vb5), vget_high_s8(va0));
+      vprod1x5 = vmlal_s8(vprod1x5, vget_high_s8(vb5), vget_high_s8(va1));
+      vprod2x5 = vmlal_s8(vprod2x5, vget_high_s8(vb5), vget_high_s8(va2));
+      vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+      vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);
+      vacc2x5 = vpadalq_s16(vacc2x5, vprod2x5);
+      int16x8_t vprod0x6 = vmull_s8(vget_low_s8(vb6), vget_low_s8(va0));
+      int16x8_t vprod1x6 = vmull_s8(vget_low_s8(vb6), vget_low_s8(va1));
+      int16x8_t vprod2x6 = vmull_s8(vget_low_s8(vb6), vget_low_s8(va2));
+      vprod0x6 = vmlal_s8(vprod0x6, vget_high_s8(vb6), vget_high_s8(va0));
+      vprod1x6 = vmlal_s8(vprod1x6, vget_high_s8(vb6), vget_high_s8(va1));
+      vprod2x6 = vmlal_s8(vprod2x6, vget_high_s8(vb6), vget_high_s8(va2));
+      vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+      vacc1x6 = vpadalq_s16(vacc1x6, vprod1x6);
+      vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6);
+      int16x8_t vprod0x7 = vmull_s8(vget_low_s8(vb7), vget_low_s8(va0));
+      int16x8_t vprod1x7 = vmull_s8(vget_low_s8(vb7), vget_low_s8(va1));
+      int16x8_t vprod2x7 = vmull_s8(vget_low_s8(vb7), vget_low_s8(va2));
+      vprod0x7 = vmlal_s8(vprod0x7, vget_high_s8(vb7), vget_high_s8(va0));
+      vprod1x7 = vmlal_s8(vprod1x7, vget_high_s8(vb7), vget_high_s8(va1));
+      vprod2x7 = vmlal_s8(vprod2x7, vget_high_s8(vb7), vget_high_s8(va2));
+      vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+      vacc1x7 = vpadalq_s16(vacc1x7, vprod1x7);
+      vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7);
+
+      k += 16 * sizeof(int8_t);
+    }
+
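+    // Horizontally reduce each per-channel int32x4 accumulator to a single sum and pack
+    // four channels per vector: AArch64 uses vpaddq_s32 pairwise adds, ARMv7 emulates
+    // them with vadd_s32/vpadd_s32 on the 64-bit halves.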
+#if XNN_ARCH_ARM64
+    const int32x4_t vsum0x01 = vpaddq_s32(vacc0x0, vacc0x1);
+    const int32x4_t vsum0x23 = vpaddq_s32(vacc0x2, vacc0x3);
+    const int32x4_t vsum0x45 = vpaddq_s32(vacc0x4, vacc0x5);
+    const int32x4_t vsum0x67 = vpaddq_s32(vacc0x6, vacc0x7);
+    const int32x4_t vsum1x01 = vpaddq_s32(vacc1x0, vacc1x1);
+    const int32x4_t vsum1x23 = vpaddq_s32(vacc1x2, vacc1x3);
+    const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);
+    const int32x4_t vsum1x67 = vpaddq_s32(vacc1x6, vacc1x7);
+    const int32x4_t vsum2x01 = vpaddq_s32(vacc2x0, vacc2x1);
+    const int32x4_t vsum2x23 = vpaddq_s32(vacc2x2, vacc2x3);
+    const int32x4_t vsum2x45 = vpaddq_s32(vacc2x4, vacc2x5);
+    const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7);
+    int32x4_t vacc0x0123 = vpaddq_s32(vsum0x01, vsum0x23);
+    int32x4_t vacc0x4567 = vpaddq_s32(vsum0x45, vsum0x67);
+    int32x4_t vacc1x0123 = vpaddq_s32(vsum1x01, vsum1x23);
+    int32x4_t vacc1x4567 = vpaddq_s32(vsum1x45, vsum1x67);
+    int32x4_t vacc2x0123 = vpaddq_s32(vsum2x01, vsum2x23);
+    int32x4_t vacc2x4567 = vpaddq_s32(vsum2x45, vsum2x67);
+#else
+    const int32x2_t vpsum0x0 = vadd_s32(vget_low_s32(vacc0x0), vget_high_s32(vacc0x0));
+    const int32x2_t vpsum0x1 = vadd_s32(vget_low_s32(vacc0x1), vget_high_s32(vacc0x1));
+    const int32x2_t vpsum0x2 = vadd_s32(vget_low_s32(vacc0x2), vget_high_s32(vacc0x2));
+    const int32x2_t vpsum0x3 = vadd_s32(vget_low_s32(vacc0x3), vget_high_s32(vacc0x3));
+    const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1);
+    const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3);
+    int32x4_t vacc0x0123 = vcombine_s32(vsum0x01, vsum0x23);
+    const int32x2_t vpsum0x4 = vadd_s32(vget_low_s32(vacc0x4), vget_high_s32(vacc0x4));
+    const int32x2_t vpsum0x5 = vadd_s32(vget_low_s32(vacc0x5), vget_high_s32(vacc0x5));
+    const int32x2_t vpsum0x6 = vadd_s32(vget_low_s32(vacc0x6), vget_high_s32(vacc0x6));
+    const int32x2_t vpsum0x7 = vadd_s32(vget_low_s32(vacc0x7), vget_high_s32(vacc0x7));
+    const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5);
+    const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7);
+    int32x4_t vacc0x4567 = vcombine_s32(vsum0x45, vsum0x67);
+    const int32x2_t vpsum1x0 = vadd_s32(vget_low_s32(vacc1x0), vget_high_s32(vacc1x0));
+    const int32x2_t vpsum1x1 = vadd_s32(vget_low_s32(vacc1x1), vget_high_s32(vacc1x1));
+    const int32x2_t vpsum1x2 = vadd_s32(vget_low_s32(vacc1x2), vget_high_s32(vacc1x2));
+    const int32x2_t vpsum1x3 = vadd_s32(vget_low_s32(vacc1x3), vget_high_s32(vacc1x3));
+    const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1);
+    const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3);
+    int32x4_t vacc1x0123 = vcombine_s32(vsum1x01, vsum1x23);
+    const int32x2_t vpsum1x4 = vadd_s32(vget_low_s32(vacc1x4), vget_high_s32(vacc1x4));
+    const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));
+    const int32x2_t vpsum1x6 = vadd_s32(vget_low_s32(vacc1x6), vget_high_s32(vacc1x6));
+    const int32x2_t vpsum1x7 = vadd_s32(vget_low_s32(vacc1x7), vget_high_s32(vacc1x7));
+    const int32x2_t vsum1x45 = vpadd_s32(vpsum1x4, vpsum1x5);
+    const int32x2_t vsum1x67 = vpadd_s32(vpsum1x6, vpsum1x7);
+    int32x4_t vacc1x4567 = vcombine_s32(vsum1x45, vsum1x67);
+    const int32x2_t vpsum2x0 = vadd_s32(vget_low_s32(vacc2x0), vget_high_s32(vacc2x0));
+    const int32x2_t vpsum2x1 = vadd_s32(vget_low_s32(vacc2x1), vget_high_s32(vacc2x1));
+    const int32x2_t vpsum2x2 = vadd_s32(vget_low_s32(vacc2x2), vget_high_s32(vacc2x2));
+    const int32x2_t vpsum2x3 = vadd_s32(vget_low_s32(vacc2x3), vget_high_s32(vacc2x3));
+    const int32x2_t vsum2x01 = vpadd_s32(vpsum2x0, vpsum2x1);
+    const int32x2_t vsum2x23 = vpadd_s32(vpsum2x2, vpsum2x3);
+    int32x4_t vacc2x0123 = vcombine_s32(vsum2x01, vsum2x23);
+    const int32x2_t vpsum2x4 = vadd_s32(vget_low_s32(vacc2x4), vget_high_s32(vacc2x4));
+    const int32x2_t vpsum2x5 = vadd_s32(vget_low_s32(vacc2x5), vget_high_s32(vacc2x5));
+    const int32x2_t vpsum2x6 = vadd_s32(vget_low_s32(vacc2x6), vget_high_s32(vacc2x6));
+    const int32x2_t vpsum2x7 = vadd_s32(vget_low_s32(vacc2x7), vget_high_s32(vacc2x7));
+    const int32x2_t vsum2x45 = vpadd_s32(vpsum2x4, vpsum2x5);
+    const int32x2_t vsum2x67 = vpadd_s32(vpsum2x6, vpsum2x7);
+    int32x4_t vacc2x4567 = vcombine_s32(vsum2x45, vsum2x67);
+#endif
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+    int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
+    int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
+    int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min);
+    vout2x01234567 = vmax_s8(vout2x01234567, vget_low_s8(voutput_min));
+
+    vout0x01234567_1x01234567 = vminq_s8(vout0x01234567_1x01234567, voutput_max);
+    vout2x01234567 = vmin_s8(vout2x01234567, vget_low_s8(voutput_max));
+
+    if (nc >= 8) {
+      vst1_s8(c0 + 0, vget_low_s8(vout0x01234567_1x01234567));
+      vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567));
+      vst1_s8(c2 + 0, vout2x01234567);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1_lane_s8(c2, vout2x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-gemm/gen/3x8c2-minmax-neon-mlal-padal-dup.c b/src/qs8-gemm/gen/3x8c2-minmax-neon-mlal-padal-dup.c
new file mode 100644
index 0000000..4d90e3a
--- /dev/null
+++ b/src/qs8-gemm/gen/3x8c2-minmax-neon-mlal-padal-dup.c
@@ -0,0 +1,376 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/c2-neon-mull-padal-dup.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+
+  do {
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc1x0123 = vacc0x0123;
+    int32x4_t vacc1x4567 = vacc0x4567;
+    int32x4_t vacc2x0123 = vacc0x0123;
+    int32x4_t vacc2x4567 = vacc0x4567;
+
+    size_t k = kc;
+
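+    // Main loop: 16 bytes of K per row per iteration. Adjacent pairs of A bytes are
+    // broadcast as one s16 lane (vdup_lane_s16 on reinterpreted data), multiplied against
+    // B with vmull_s8/vmlal_s8, and the paired int16 products are accumulated into int32
+    // with vpadalq_s16.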
+    while (k >= 16 * sizeof(int8_t)) {
+      const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;
+      const int8x8_t va0x1 = vld1_s8(a0); a0 += 8;
+      const int8x8_t va1x0 = vld1_s8(a1); a1 += 8;
+      const int8x8_t va1x1 = vld1_s8(a1); a1 += 8;
+      const int8x8_t va2x0 = vld1_s8(a2); a2 += 8;
+      const int8x8_t va2x1 = vld1_s8(a2); a2 += 8;
+
+      const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+      int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0)));
+      int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
+      const int8x8_t vb0123c0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x0123c0 = vmlal_s8(vprod0x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+      vprod1x0123c0 = vmlal_s8(vprod1x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
+      vprod2x0123c0 = vmlal_s8(vprod2x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 0)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+      vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+      int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+      int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0)));
+      int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
+      const int8x8_t vb4567c0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x4567c0 = vmlal_s8(vprod0x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+      vprod1x4567c0 = vmlal_s8(vprod1x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
+      vprod2x4567c0 = vmlal_s8(vprod2x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 0)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+      vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+      int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+      int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 1)));
+      int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
+      const int8x8_t vb0123c1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x0123c1 = vmlal_s8(vprod0x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+      vprod1x0123c1 = vmlal_s8(vprod1x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
+      vprod2x0123c1 = vmlal_s8(vprod2x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 1)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+      vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+      int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+      int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 1)));
+      int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
+      const int8x8_t vb4567c1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x4567c1 = vmlal_s8(vprod0x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+      vprod1x4567c1 = vmlal_s8(vprod1x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
+      vprod2x4567c1 = vmlal_s8(vprod2x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 1)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+      vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+      int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+      int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 2)));
+      int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 2)));
+      const int8x8_t vb0123c2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x0123c2 = vmlal_s8(vprod0x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+      vprod1x0123c2 = vmlal_s8(vprod1x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
+      vprod2x0123c2 = vmlal_s8(vprod2x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 2)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+      vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+      int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+      int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 2)));
+      int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 2)));
+      const int8x8_t vb4567c2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x4567c2 = vmlal_s8(vprod0x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+      vprod1x4567c2 = vmlal_s8(vprod1x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
+      vprod2x4567c2 = vmlal_s8(vprod2x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 2)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+      vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+      int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+      int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 3)));
+      int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 3)));
+      const int8x8_t vb0123c3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x0123c3 = vmlal_s8(vprod0x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+      vprod1x0123c3 = vmlal_s8(vprod1x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
+      vprod2x0123c3 = vmlal_s8(vprod2x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 3)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+      vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
+      int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+      int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 3)));
+      int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 3)));
+      const int8x8_t vb4567c3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x4567c3 = vmlal_s8(vprod0x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+      vprod1x4567c3 = vmlal_s8(vprod1x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
+      vprod2x4567c3 = vmlal_s8(vprod2x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 3)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+      vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
+
+      k -= 16 * sizeof(int8_t);
+    }
+
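+    // Tail when at least 8 K bytes remain after the doubled loop: a single 8-byte block
+    // per row, computed with vmull_s8 only (no vmlal pairing).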
+    if (k >= 8 * sizeof(int8_t)) {
+      const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+      const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+      const int8x8_t va2 = vld1_s8(a2); a2 += 8;
+
+      const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+      const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+      const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+      const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+      const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+      const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+      const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+      const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+      const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+      const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+      const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+      const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+      const int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+      const int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+      const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+      const int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+      vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+      vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+      vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+      vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
+      const int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+      const int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+      const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+      const int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+      vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+      vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+      vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+      vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
+
+      k -= 8 * sizeof(int8_t);
+    }
+
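+    // Remainder: kc was rounded up to a multiple of 2, so up to 6 K-elements (three column
+    // groups of 2) may be left over; they are handled one group at a time below.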
+    if XNN_UNLIKELY(k != 0) {
+      const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+      const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
+      const int8x8_t va2 = vld1_s8(a2); a2 = (const int8_t*) ((uintptr_t) a2 + k);
+
+      const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+      const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+      const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+      const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+      const int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+      vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+      const int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+      vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+
+      if (k > 2 * sizeof(int8_t)) {
+        const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+        const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+        const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+        const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+        const int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+        const int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+
+        if (k > 4 * sizeof(int8_t)) {
+          const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+          vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+          const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+          vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+          const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+          vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+          const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+          vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+          const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+          vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+          const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+          vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+        }
+      }
+    }
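+    // Requantize the int32 accumulators: saturating fixed-point multiply by the runtime
+    // multiplier, rounding right shift (the vbic/vsraq step adjusts rounding of negative
+    // values), add the output zero point, then narrow with saturation and clamp.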
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
+    int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
+    int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min);
+    vout2x01234567 = vmax_s8(vout2x01234567, vget_low_s8(voutput_min));
+
+    vout0x01234567_1x01234567 = vminq_s8(vout0x01234567_1x01234567, voutput_max);
+    vout2x01234567 = vmin_s8(vout2x01234567, vget_low_s8(voutput_max));
+
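+    // Store an 8-wide tile for each of the 3 rows, or fall through to the 4/2/1-column
+    // remainder stores when fewer than 8 output columns remain.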
+    if (nc >= 8) {
+      vst1_s8(c0 + 0, vget_low_s8(vout0x01234567_1x01234567));
+      vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567));
+      vst1_s8(c2 + 0, vout2x01234567);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1_lane_s8(c2, vout2x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-gemm/gen/3x8c2-minmax-neon-mull-padal-dup.c b/src/qs8-gemm/gen/3x8c2-minmax-neon-mull-padal-dup.c
new file mode 100644
index 0000000..cdd6df2
--- /dev/null
+++ b/src/qs8-gemm/gen/3x8c2-minmax-neon-mull-padal-dup.c
@@ -0,0 +1,276 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/c2-neon-mull-padal-dup.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mull_padal_dup(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+
+  do {
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc1x0123 = vacc0x0123;
+    int32x4_t vacc1x4567 = vacc0x4567;
+    int32x4_t vacc2x0123 = vacc0x0123;
+    int32x4_t vacc2x4567 = vacc0x4567;
+
+    size_t k = kc;
+
+
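+    // Main loop: consume 8 K-elements per iteration. Each vdup_lane_s16 broadcasts a pair of
+    // consecutive int8 activations; vmull_s8 against the packed weights (4 columns x 2
+    // K-elements) produces int16 products that vpadalq_s16 accumulates pairwise into int32.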
+    while (k >= 8 * sizeof(int8_t)) {
+      const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+      const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+      const int8x8_t va2 = vld1_s8(a2); a2 += 8;
+
+      const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+      const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+      const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+      const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+      const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+      const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+      const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+      const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+      const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+      const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+      const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+      const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+      const int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+      const int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+      const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+      const int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+      vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+      vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+      vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+      vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
+      const int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+      const int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+      const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+      const int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+      vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+      vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+      vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+      vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
+
+      k -= 8 * sizeof(int8_t);
+    }
+
+    if XNN_UNLIKELY(k != 0) {
+      const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+      const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
+      const int8x8_t va2 = vld1_s8(a2); a2 = (const int8_t*) ((uintptr_t) a2 + k);
+
+      const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+      const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+      const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+      const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+      const int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+      vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+      const int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+      vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+
+      if (k > 2 * sizeof(int8_t)) {
+        const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+        const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+        const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+        const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+        const int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+        const int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+
+        if (k > 4 * sizeof(int8_t)) {
+          const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+          vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+          const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+          vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+          const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+          vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+          const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+          vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+          const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+          vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+          const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+          vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+        }
+      }
+    }
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
+    int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
+    int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min);
+    vout2x01234567 = vmax_s8(vout2x01234567, vget_low_s8(voutput_min));
+
+    vout0x01234567_1x01234567 = vminq_s8(vout0x01234567_1x01234567, voutput_max);
+    vout2x01234567 = vmin_s8(vout2x01234567, vget_low_s8(voutput_max));
+
+    if (nc >= 8) {
+      vst1_s8(c0 + 0, vget_low_s8(vout0x01234567_1x01234567));
+      vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567));
+      vst1_s8(c2 + 0, vout2x01234567);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1_lane_s8(c2, vout2x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-gemm/gen/3x8c8-minmax-avx2.c b/src/qs8-gemm/gen/3x8c8-minmax-avx2.c
index f4d3c06..d703f98 100644
--- a/src/qs8-gemm/gen/3x8c8-minmax-avx2.c
+++ b/src/qs8-gemm/gen/3x8c8-minmax-avx2.c
@@ -13,6 +13,7 @@
 
 #include <xnnpack/gemm.h>
 #include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2(
@@ -36,6 +37,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -195,14 +197,14 @@
       _mm_storel_epi64((__m128i*) c1, vout_hi);
       _mm_storeh_pi((__m64*) c2, _mm_castsi128_ps(vout_lo));
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-      a2 = (const int8_t*) ((uintptr_t) a2 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
       nc -= 8;
     } else {
       if (nc & 4) {
diff --git a/src/qs8-gemm/gen/3x8c8-minmax-neon-mlal-padal.c b/src/qs8-gemm/gen/3x8c8-minmax-neon-mlal-padal.c
new file mode 100644
index 0000000..5671d6f
--- /dev/null
+++ b/src/qs8-gemm/gen/3x8c8-minmax-neon-mlal-padal.c
@@ -0,0 +1,400 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/c8-neon-mull-padal.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+
+  do {
+    int32x4_t vacc0x0 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x1 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x2 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x3 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x4 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x5 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x6 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x7 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc1x0 = vacc0x0;
+    int32x4_t vacc1x1 = vacc0x1;
+    int32x4_t vacc1x2 = vacc0x2;
+    int32x4_t vacc1x3 = vacc0x3;
+    int32x4_t vacc1x4 = vacc0x4;
+    int32x4_t vacc1x5 = vacc0x5;
+    int32x4_t vacc1x6 = vacc0x6;
+    int32x4_t vacc1x7 = vacc0x7;
+    int32x4_t vacc2x0 = vacc0x0;
+    int32x4_t vacc2x1 = vacc0x1;
+    int32x4_t vacc2x2 = vacc0x2;
+    int32x4_t vacc2x3 = vacc0x3;
+    int32x4_t vacc2x4 = vacc0x4;
+    int32x4_t vacc2x5 = vacc0x5;
+    int32x4_t vacc2x6 = vacc0x6;
+    int32x4_t vacc2x7 = vacc0x7;
+
+    size_t k = kc;
+    // 2x partial unrolled loop to load 16 bytes at a time using MLA.
+    while (k >= 16 * sizeof(int8_t)) {
+      const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;
+      const int8x8_t va0x1 = vld1_s8(a0); a0 += 8;
+      const int8x8_t va1x0 = vld1_s8(a1); a1 += 8;
+      const int8x8_t va1x1 = vld1_s8(a1); a1 += 8;
+      const int8x8_t va2x0 = vld1_s8(a2); a2 += 8;
+      const int8x8_t va2x1 = vld1_s8(a2); a2 += 8;
+
+      const int8x8_t vb0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb4x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb5x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb6x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb7x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+
+      const int8x8_t vb0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x0 = vmull_s8(vb0x0, va0x0);
+      int16x8_t vprod1x0 = vmull_s8(vb0x0, va1x0);
+      int16x8_t vprod2x0 = vmull_s8(vb0x0, va2x0);
+      vprod0x0 = vmlal_s8(vprod0x0, vb0x1, va0x1);
+      vprod1x0 = vmlal_s8(vprod1x0, vb0x1, va1x1);
+      vprod2x0 = vmlal_s8(vprod2x0, vb0x1, va2x1);
+      vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+      vacc1x0 = vpadalq_s16(vacc1x0, vprod1x0);
+      vacc2x0 = vpadalq_s16(vacc2x0, vprod2x0);
+      const int8x8_t vb1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x1 = vmull_s8(vb1x0, va0x0);
+      int16x8_t vprod1x1 = vmull_s8(vb1x0, va1x0);
+      int16x8_t vprod2x1 = vmull_s8(vb1x0, va2x0);
+      vprod0x1 = vmlal_s8(vprod0x1, vb1x1, va0x1);
+      vprod1x1 = vmlal_s8(vprod1x1, vb1x1, va1x1);
+      vprod2x1 = vmlal_s8(vprod2x1, vb1x1, va2x1);
+      vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+      vacc1x1 = vpadalq_s16(vacc1x1, vprod1x1);
+      vacc2x1 = vpadalq_s16(vacc2x1, vprod2x1);
+      const int8x8_t vb2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x2 = vmull_s8(vb2x0, va0x0);
+      int16x8_t vprod1x2 = vmull_s8(vb2x0, va1x0);
+      int16x8_t vprod2x2 = vmull_s8(vb2x0, va2x0);
+      vprod0x2 = vmlal_s8(vprod0x2, vb2x1, va0x1);
+      vprod1x2 = vmlal_s8(vprod1x2, vb2x1, va1x1);
+      vprod2x2 = vmlal_s8(vprod2x2, vb2x1, va2x1);
+      vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+      vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);
+      vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
+      const int8x8_t vb3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x3 = vmull_s8(vb3x0, va0x0);
+      int16x8_t vprod1x3 = vmull_s8(vb3x0, va1x0);
+      int16x8_t vprod2x3 = vmull_s8(vb3x0, va2x0);
+      vprod0x3 = vmlal_s8(vprod0x3, vb3x1, va0x1);
+      vprod1x3 = vmlal_s8(vprod1x3, vb3x1, va1x1);
+      vprod2x3 = vmlal_s8(vprod2x3, vb3x1, va2x1);
+      vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+      vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);
+      vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
+      const int8x8_t vb4x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x4 = vmull_s8(vb4x0, va0x0);
+      int16x8_t vprod1x4 = vmull_s8(vb4x0, va1x0);
+      int16x8_t vprod2x4 = vmull_s8(vb4x0, va2x0);
+      vprod0x4 = vmlal_s8(vprod0x4, vb4x1, va0x1);
+      vprod1x4 = vmlal_s8(vprod1x4, vb4x1, va1x1);
+      vprod2x4 = vmlal_s8(vprod2x4, vb4x1, va2x1);
+      vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+      vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);
+      vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);
+      const int8x8_t vb5x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x5 = vmull_s8(vb5x0, va0x0);
+      int16x8_t vprod1x5 = vmull_s8(vb5x0, va1x0);
+      int16x8_t vprod2x5 = vmull_s8(vb5x0, va2x0);
+      vprod0x5 = vmlal_s8(vprod0x5, vb5x1, va0x1);
+      vprod1x5 = vmlal_s8(vprod1x5, vb5x1, va1x1);
+      vprod2x5 = vmlal_s8(vprod2x5, vb5x1, va2x1);
+      vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+      vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);
+      vacc2x5 = vpadalq_s16(vacc2x5, vprod2x5);
+      const int8x8_t vb6x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x6 = vmull_s8(vb6x0, va0x0);
+      int16x8_t vprod1x6 = vmull_s8(vb6x0, va1x0);
+      int16x8_t vprod2x6 = vmull_s8(vb6x0, va2x0);
+      vprod0x6 = vmlal_s8(vprod0x6, vb6x1, va0x1);
+      vprod1x6 = vmlal_s8(vprod1x6, vb6x1, va1x1);
+      vprod2x6 = vmlal_s8(vprod2x6, vb6x1, va2x1);
+      vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+      vacc1x6 = vpadalq_s16(vacc1x6, vprod1x6);
+      vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6);
+      const int8x8_t vb7x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x7 = vmull_s8(vb7x0, va0x0);
+      int16x8_t vprod1x7 = vmull_s8(vb7x0, va1x0);
+      int16x8_t vprod2x7 = vmull_s8(vb7x0, va2x0);
+      vprod0x7 = vmlal_s8(vprod0x7, vb7x1, va0x1);
+      vprod1x7 = vmlal_s8(vprod1x7, vb7x1, va1x1);
+      vprod2x7 = vmlal_s8(vprod2x7, vb7x1, va2x1);
+      vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+      vacc1x7 = vpadalq_s16(vacc1x7, vprod1x7);
+      vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7);
+
+      k -= 16 * sizeof(int8_t);
+    }
+
+    // Handle 8 bytes at a time using MUL.
+    if (k > 0) {
+      const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+      const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+      const int8x8_t va2 = vld1_s8(a2); a2 += 8;
+
+      const int8x8_t vb0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x0 = vmull_s8(vb0, va0);
+      const int16x8_t vprod1x0 = vmull_s8(vb0, va1);
+      const int16x8_t vprod2x0 = vmull_s8(vb0, va2);
+      vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+      vacc1x0 = vpadalq_s16(vacc1x0, vprod1x0);
+      vacc2x0 = vpadalq_s16(vacc2x0, vprod2x0);
+      const int8x8_t vb1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x1 = vmull_s8(vb1, va0);
+      const int16x8_t vprod1x1 = vmull_s8(vb1, va1);
+      const int16x8_t vprod2x1 = vmull_s8(vb1, va2);
+      vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+      vacc1x1 = vpadalq_s16(vacc1x1, vprod1x1);
+      vacc2x1 = vpadalq_s16(vacc2x1, vprod2x1);
+      const int8x8_t vb2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x2 = vmull_s8(vb2, va0);
+      const int16x8_t vprod1x2 = vmull_s8(vb2, va1);
+      const int16x8_t vprod2x2 = vmull_s8(vb2, va2);
+      vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+      vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);
+      vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
+      const int8x8_t vb3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x3 = vmull_s8(vb3, va0);
+      const int16x8_t vprod1x3 = vmull_s8(vb3, va1);
+      const int16x8_t vprod2x3 = vmull_s8(vb3, va2);
+      vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+      vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);
+      vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
+      const int8x8_t vb4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x4 = vmull_s8(vb4, va0);
+      const int16x8_t vprod1x4 = vmull_s8(vb4, va1);
+      const int16x8_t vprod2x4 = vmull_s8(vb4, va2);
+      vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+      vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);
+      vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);
+      const int8x8_t vb5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x5 = vmull_s8(vb5, va0);
+      const int16x8_t vprod1x5 = vmull_s8(vb5, va1);
+      const int16x8_t vprod2x5 = vmull_s8(vb5, va2);
+      vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+      vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);
+      vacc2x5 = vpadalq_s16(vacc2x5, vprod2x5);
+      const int8x8_t vb6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x6 = vmull_s8(vb6, va0);
+      const int16x8_t vprod1x6 = vmull_s8(vb6, va1);
+      const int16x8_t vprod2x6 = vmull_s8(vb6, va2);
+      vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+      vacc1x6 = vpadalq_s16(vacc1x6, vprod1x6);
+      vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6);
+      const int8x8_t vb7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x7 = vmull_s8(vb7, va0);
+      const int16x8_t vprod1x7 = vmull_s8(vb7, va1);
+      const int16x8_t vprod2x7 = vmull_s8(vb7, va2);
+      vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+      vacc1x7 = vpadalq_s16(vacc1x7, vprod1x7);
+      vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7);
+
+      k -= 8 * sizeof(int8_t);
+    }
+
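+    // Each vacc{M}x{N} register holds four partial sums for a single output column; pairwise
+    // adds collapse them into per-row vectors covering columns 0-3 and 4-7.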
+#if XNN_ARCH_ARM64
+    const int32x4_t vsum0x01 = vpaddq_s32(vacc0x0, vacc0x1);
+    const int32x4_t vsum0x23 = vpaddq_s32(vacc0x2, vacc0x3);
+    const int32x4_t vsum0x45 = vpaddq_s32(vacc0x4, vacc0x5);
+    const int32x4_t vsum0x67 = vpaddq_s32(vacc0x6, vacc0x7);
+    const int32x4_t vsum1x01 = vpaddq_s32(vacc1x0, vacc1x1);
+    const int32x4_t vsum1x23 = vpaddq_s32(vacc1x2, vacc1x3);
+    const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);
+    const int32x4_t vsum1x67 = vpaddq_s32(vacc1x6, vacc1x7);
+    const int32x4_t vsum2x01 = vpaddq_s32(vacc2x0, vacc2x1);
+    const int32x4_t vsum2x23 = vpaddq_s32(vacc2x2, vacc2x3);
+    const int32x4_t vsum2x45 = vpaddq_s32(vacc2x4, vacc2x5);
+    const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7);
+    int32x4_t vacc0x0123 = vpaddq_s32(vsum0x01, vsum0x23);
+    int32x4_t vacc0x4567 = vpaddq_s32(vsum0x45, vsum0x67);
+    int32x4_t vacc1x0123 = vpaddq_s32(vsum1x01, vsum1x23);
+    int32x4_t vacc1x4567 = vpaddq_s32(vsum1x45, vsum1x67);
+    int32x4_t vacc2x0123 = vpaddq_s32(vsum2x01, vsum2x23);
+    int32x4_t vacc2x4567 = vpaddq_s32(vsum2x45, vsum2x67);
+#else
+    const int32x2_t vpsum0x0 = vadd_s32(vget_low_s32(vacc0x0), vget_high_s32(vacc0x0));
+    const int32x2_t vpsum0x1 = vadd_s32(vget_low_s32(vacc0x1), vget_high_s32(vacc0x1));
+    const int32x2_t vpsum0x2 = vadd_s32(vget_low_s32(vacc0x2), vget_high_s32(vacc0x2));
+    const int32x2_t vpsum0x3 = vadd_s32(vget_low_s32(vacc0x3), vget_high_s32(vacc0x3));
+    const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1);
+    const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3);
+    int32x4_t vacc0x0123 = vcombine_s32(vsum0x01, vsum0x23 );
+    const int32x2_t vpsum0x4 = vadd_s32(vget_low_s32(vacc0x4), vget_high_s32(vacc0x4));
+    const int32x2_t vpsum0x5 = vadd_s32(vget_low_s32(vacc0x5), vget_high_s32(vacc0x5));
+    const int32x2_t vpsum0x6 = vadd_s32(vget_low_s32(vacc0x6), vget_high_s32(vacc0x6));
+    const int32x2_t vpsum0x7 = vadd_s32(vget_low_s32(vacc0x7), vget_high_s32(vacc0x7));
+    const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5);
+    const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7);
+    int32x4_t vacc0x4567 = vcombine_s32(vsum0x45, vsum0x67 );
+    const int32x2_t vpsum1x0 = vadd_s32(vget_low_s32(vacc1x0), vget_high_s32(vacc1x0));
+    const int32x2_t vpsum1x1 = vadd_s32(vget_low_s32(vacc1x1), vget_high_s32(vacc1x1));
+    const int32x2_t vpsum1x2 = vadd_s32(vget_low_s32(vacc1x2), vget_high_s32(vacc1x2));
+    const int32x2_t vpsum1x3 = vadd_s32(vget_low_s32(vacc1x3), vget_high_s32(vacc1x3));
+    const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1);
+    const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3);
+    int32x4_t vacc1x0123 = vcombine_s32(vsum1x01, vsum1x23 );
+    const int32x2_t vpsum1x4 = vadd_s32(vget_low_s32(vacc1x4), vget_high_s32(vacc1x4));
+    const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));
+    const int32x2_t vpsum1x6 = vadd_s32(vget_low_s32(vacc1x6), vget_high_s32(vacc1x6));
+    const int32x2_t vpsum1x7 = vadd_s32(vget_low_s32(vacc1x7), vget_high_s32(vacc1x7));
+    const int32x2_t vsum1x45 = vpadd_s32(vpsum1x4, vpsum1x5);
+    const int32x2_t vsum1x67 = vpadd_s32(vpsum1x6, vpsum1x7);
+    int32x4_t vacc1x4567 = vcombine_s32(vsum1x45, vsum1x67 );
+    const int32x2_t vpsum2x0 = vadd_s32(vget_low_s32(vacc2x0), vget_high_s32(vacc2x0));
+    const int32x2_t vpsum2x1 = vadd_s32(vget_low_s32(vacc2x1), vget_high_s32(vacc2x1));
+    const int32x2_t vpsum2x2 = vadd_s32(vget_low_s32(vacc2x2), vget_high_s32(vacc2x2));
+    const int32x2_t vpsum2x3 = vadd_s32(vget_low_s32(vacc2x3), vget_high_s32(vacc2x3));
+    const int32x2_t vsum2x01 = vpadd_s32(vpsum2x0, vpsum2x1);
+    const int32x2_t vsum2x23 = vpadd_s32(vpsum2x2, vpsum2x3);
+    int32x4_t vacc2x0123 = vcombine_s32(vsum2x01, vsum2x23 );
+    const int32x2_t vpsum2x4 = vadd_s32(vget_low_s32(vacc2x4), vget_high_s32(vacc2x4));
+    const int32x2_t vpsum2x5 = vadd_s32(vget_low_s32(vacc2x5), vget_high_s32(vacc2x5));
+    const int32x2_t vpsum2x6 = vadd_s32(vget_low_s32(vacc2x6), vget_high_s32(vacc2x6));
+    const int32x2_t vpsum2x7 = vadd_s32(vget_low_s32(vacc2x7), vget_high_s32(vacc2x7));
+    const int32x2_t vsum2x45 = vpadd_s32(vpsum2x4, vpsum2x5);
+    const int32x2_t vsum2x67 = vpadd_s32(vpsum2x6, vpsum2x7);
+    int32x4_t vacc2x4567 = vcombine_s32(vsum2x45, vsum2x67 );
+#endif
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+    int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
+    int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
+    int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min);
+    vout2x01234567 = vmax_s8(vout2x01234567, vget_low_s8(voutput_min));
+
+    vout0x01234567_1x01234567 = vminq_s8(vout0x01234567_1x01234567, voutput_max);
+    vout2x01234567 = vmin_s8(vout2x01234567, vget_low_s8(voutput_max));
+
+    if (nc >= 8) {
+      vst1_s8(c0 + 0, vget_low_s8(vout0x01234567_1x01234567));
+      vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567));
+      vst1_s8(c2 + 0, vout2x01234567);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1_lane_s8(c2, vout2x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-gemm/gen/3x8c8-minmax-neon-mull-padal.c b/src/qs8-gemm/gen/3x8c8-minmax-neon-mull-padal.c
new file mode 100644
index 0000000..358654f
--- /dev/null
+++ b/src/qs8-gemm/gen/3x8c8-minmax-neon-mull-padal.c
@@ -0,0 +1,299 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/c8-neon-mull-padal.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+
+  do {
+    int32x4_t vacc0x0 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x1 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x2 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x3 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x4 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x5 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x6 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x7 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc1x0 = vacc0x0;
+    int32x4_t vacc1x1 = vacc0x1;
+    int32x4_t vacc1x2 = vacc0x2;
+    int32x4_t vacc1x3 = vacc0x3;
+    int32x4_t vacc1x4 = vacc0x4;
+    int32x4_t vacc1x5 = vacc0x5;
+    int32x4_t vacc1x6 = vacc0x6;
+    int32x4_t vacc1x7 = vacc0x7;
+    int32x4_t vacc2x0 = vacc0x0;
+    int32x4_t vacc2x1 = vacc0x1;
+    int32x4_t vacc2x2 = vacc0x2;
+    int32x4_t vacc2x3 = vacc0x3;
+    int32x4_t vacc2x4 = vacc0x4;
+    int32x4_t vacc2x5 = vacc0x5;
+    int32x4_t vacc2x6 = vacc0x6;
+    int32x4_t vacc2x7 = vacc0x7;
+
+    size_t k = kc;
+
+    // Handle 8 bytes at a time using MUL.
+    while (k > 0) {
+      const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+      const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+      const int8x8_t va2 = vld1_s8(a2); a2 += 8;
+
+      const int8x8_t vb0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x0 = vmull_s8(vb0, va0);
+      const int16x8_t vprod1x0 = vmull_s8(vb0, va1);
+      const int16x8_t vprod2x0 = vmull_s8(vb0, va2);
+      vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+      vacc1x0 = vpadalq_s16(vacc1x0, vprod1x0);
+      vacc2x0 = vpadalq_s16(vacc2x0, vprod2x0);
+      const int8x8_t vb1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x1 = vmull_s8(vb1, va0);
+      const int16x8_t vprod1x1 = vmull_s8(vb1, va1);
+      const int16x8_t vprod2x1 = vmull_s8(vb1, va2);
+      vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+      vacc1x1 = vpadalq_s16(vacc1x1, vprod1x1);
+      vacc2x1 = vpadalq_s16(vacc2x1, vprod2x1);
+      const int8x8_t vb2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x2 = vmull_s8(vb2, va0);
+      const int16x8_t vprod1x2 = vmull_s8(vb2, va1);
+      const int16x8_t vprod2x2 = vmull_s8(vb2, va2);
+      vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+      vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);
+      vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
+      const int8x8_t vb3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x3 = vmull_s8(vb3, va0);
+      const int16x8_t vprod1x3 = vmull_s8(vb3, va1);
+      const int16x8_t vprod2x3 = vmull_s8(vb3, va2);
+      vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+      vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);
+      vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
+      const int8x8_t vb4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x4 = vmull_s8(vb4, va0);
+      const int16x8_t vprod1x4 = vmull_s8(vb4, va1);
+      const int16x8_t vprod2x4 = vmull_s8(vb4, va2);
+      vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+      vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);
+      vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);
+      const int8x8_t vb5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x5 = vmull_s8(vb5, va0);
+      const int16x8_t vprod1x5 = vmull_s8(vb5, va1);
+      const int16x8_t vprod2x5 = vmull_s8(vb5, va2);
+      vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+      vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);
+      vacc2x5 = vpadalq_s16(vacc2x5, vprod2x5);
+      const int8x8_t vb6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x6 = vmull_s8(vb6, va0);
+      const int16x8_t vprod1x6 = vmull_s8(vb6, va1);
+      const int16x8_t vprod2x6 = vmull_s8(vb6, va2);
+      vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+      vacc1x6 = vpadalq_s16(vacc1x6, vprod1x6);
+      vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6);
+      const int8x8_t vb7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x7 = vmull_s8(vb7, va0);
+      const int16x8_t vprod1x7 = vmull_s8(vb7, va1);
+      const int16x8_t vprod2x7 = vmull_s8(vb7, va2);
+      vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+      vacc1x7 = vpadalq_s16(vacc1x7, vprod1x7);
+      vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7);
+
+      k -= 8 * sizeof(int8_t);
+    }
+
+#if XNN_ARCH_ARM64
+    const int32x4_t vsum0x01 = vpaddq_s32(vacc0x0, vacc0x1);
+    const int32x4_t vsum0x23 = vpaddq_s32(vacc0x2, vacc0x3);
+    const int32x4_t vsum0x45 = vpaddq_s32(vacc0x4, vacc0x5);
+    const int32x4_t vsum0x67 = vpaddq_s32(vacc0x6, vacc0x7);
+    const int32x4_t vsum1x01 = vpaddq_s32(vacc1x0, vacc1x1);
+    const int32x4_t vsum1x23 = vpaddq_s32(vacc1x2, vacc1x3);
+    const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);
+    const int32x4_t vsum1x67 = vpaddq_s32(vacc1x6, vacc1x7);
+    const int32x4_t vsum2x01 = vpaddq_s32(vacc2x0, vacc2x1);
+    const int32x4_t vsum2x23 = vpaddq_s32(vacc2x2, vacc2x3);
+    const int32x4_t vsum2x45 = vpaddq_s32(vacc2x4, vacc2x5);
+    const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7);
+    int32x4_t vacc0x0123 = vpaddq_s32(vsum0x01, vsum0x23);
+    int32x4_t vacc0x4567 = vpaddq_s32(vsum0x45, vsum0x67);
+    int32x4_t vacc1x0123 = vpaddq_s32(vsum1x01, vsum1x23);
+    int32x4_t vacc1x4567 = vpaddq_s32(vsum1x45, vsum1x67);
+    int32x4_t vacc2x0123 = vpaddq_s32(vsum2x01, vsum2x23);
+    int32x4_t vacc2x4567 = vpaddq_s32(vsum2x45, vsum2x67);
+#else
+    const int32x2_t vpsum0x0 = vadd_s32(vget_low_s32(vacc0x0), vget_high_s32(vacc0x0));
+    const int32x2_t vpsum0x1 = vadd_s32(vget_low_s32(vacc0x1), vget_high_s32(vacc0x1));
+    const int32x2_t vpsum0x2 = vadd_s32(vget_low_s32(vacc0x2), vget_high_s32(vacc0x2));
+    const int32x2_t vpsum0x3 = vadd_s32(vget_low_s32(vacc0x3), vget_high_s32(vacc0x3));
+    const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1);
+    const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3);
+    int32x4_t vacc0x0123 = vcombine_s32(vsum0x01, vsum0x23);
+    const int32x2_t vpsum0x4 = vadd_s32(vget_low_s32(vacc0x4), vget_high_s32(vacc0x4));
+    const int32x2_t vpsum0x5 = vadd_s32(vget_low_s32(vacc0x5), vget_high_s32(vacc0x5));
+    const int32x2_t vpsum0x6 = vadd_s32(vget_low_s32(vacc0x6), vget_high_s32(vacc0x6));
+    const int32x2_t vpsum0x7 = vadd_s32(vget_low_s32(vacc0x7), vget_high_s32(vacc0x7));
+    const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5);
+    const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7);
+    int32x4_t vacc0x4567 = vcombine_s32(vsum0x45, vsum0x67);
+    const int32x2_t vpsum1x0 = vadd_s32(vget_low_s32(vacc1x0), vget_high_s32(vacc1x0));
+    const int32x2_t vpsum1x1 = vadd_s32(vget_low_s32(vacc1x1), vget_high_s32(vacc1x1));
+    const int32x2_t vpsum1x2 = vadd_s32(vget_low_s32(vacc1x2), vget_high_s32(vacc1x2));
+    const int32x2_t vpsum1x3 = vadd_s32(vget_low_s32(vacc1x3), vget_high_s32(vacc1x3));
+    const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1);
+    const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3);
+    int32x4_t vacc1x0123 = vcombine_s32(vsum1x01, vsum1x23);
+    const int32x2_t vpsum1x4 = vadd_s32(vget_low_s32(vacc1x4), vget_high_s32(vacc1x4));
+    const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));
+    const int32x2_t vpsum1x6 = vadd_s32(vget_low_s32(vacc1x6), vget_high_s32(vacc1x6));
+    const int32x2_t vpsum1x7 = vadd_s32(vget_low_s32(vacc1x7), vget_high_s32(vacc1x7));
+    const int32x2_t vsum1x45 = vpadd_s32(vpsum1x4, vpsum1x5);
+    const int32x2_t vsum1x67 = vpadd_s32(vpsum1x6, vpsum1x7);
+    int32x4_t vacc1x4567 = vcombine_s32(vsum1x45, vsum1x67);
+    const int32x2_t vpsum2x0 = vadd_s32(vget_low_s32(vacc2x0), vget_high_s32(vacc2x0));
+    const int32x2_t vpsum2x1 = vadd_s32(vget_low_s32(vacc2x1), vget_high_s32(vacc2x1));
+    const int32x2_t vpsum2x2 = vadd_s32(vget_low_s32(vacc2x2), vget_high_s32(vacc2x2));
+    const int32x2_t vpsum2x3 = vadd_s32(vget_low_s32(vacc2x3), vget_high_s32(vacc2x3));
+    const int32x2_t vsum2x01 = vpadd_s32(vpsum2x0, vpsum2x1);
+    const int32x2_t vsum2x23 = vpadd_s32(vpsum2x2, vpsum2x3);
+    int32x4_t vacc2x0123 = vcombine_s32(vsum2x01, vsum2x23);
+    const int32x2_t vpsum2x4 = vadd_s32(vget_low_s32(vacc2x4), vget_high_s32(vacc2x4));
+    const int32x2_t vpsum2x5 = vadd_s32(vget_low_s32(vacc2x5), vget_high_s32(vacc2x5));
+    const int32x2_t vpsum2x6 = vadd_s32(vget_low_s32(vacc2x6), vget_high_s32(vacc2x6));
+    const int32x2_t vpsum2x7 = vadd_s32(vget_low_s32(vacc2x7), vget_high_s32(vacc2x7));
+    const int32x2_t vsum2x45 = vpadd_s32(vpsum2x4, vpsum2x5);
+    const int32x2_t vsum2x67 = vpadd_s32(vpsum2x6, vpsum2x7);
+    int32x4_t vacc2x4567 = vcombine_s32(vsum2x45, vsum2x67);
+#endif
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+    int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
+    int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
+    int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min);
+    vout2x01234567 = vmax_s8(vout2x01234567, vget_low_s8(voutput_min));
+
+    vout0x01234567_1x01234567 = vminq_s8(vout0x01234567_1x01234567, voutput_max);
+    vout2x01234567 = vmin_s8(vout2x01234567, vget_low_s8(voutput_max));
+
+    if (nc >= 8) {
+      vst1_s8(c0 + 0, vget_low_s8(vout0x01234567_1x01234567));
+      vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567));
+      vst1_s8(c2 + 0, vout2x01234567);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1_lane_s8(c2, vout2x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-gemm/gen/3x8c8-xw-minmax-avx2.c b/src/qs8-gemm/gen/3x8c8-xw-minmax-avx2.c
index 370b8d7..290c6af 100644
--- a/src/qs8-gemm/gen/3x8c8-xw-minmax-avx2.c
+++ b/src/qs8-gemm/gen/3x8c8-xw-minmax-avx2.c
@@ -13,6 +13,7 @@
 
 #include <xnnpack/gemm.h>
 #include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2(
@@ -36,6 +37,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -191,14 +193,14 @@
       _mm_storel_epi64((__m128i*) c1, vout_hi);
       _mm_storeh_pi((__m64*) c2, _mm_castsi128_ps(vout_lo));
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-      a2 = (const int8_t*) ((uintptr_t) a2 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
       nc -= 8;
     } else {
       if (nc & 4) {
diff --git a/src/qs8-gemm/gen/4x16-minmax-neon-mlal-lane.c b/src/qs8-gemm/gen/4x16-minmax-neon-mlal-lane.c
new file mode 100644
index 0000000..2a7b658
--- /dev/null
+++ b/src/qs8-gemm/gen/4x16-minmax-neon-mlal-lane.c
@@ -0,0 +1,597 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/neon-mlal-lane.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/gemm.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+  const int8_t* a3 = (const int8_t*) ((uintptr_t) a2 + a_stride);
+  int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    a3 = a2;
+    c3 = c2;
+  }
+
+  do {
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc1x0123 = vacc0x0123;
+    int32x4_t vacc1x4567 = vacc0x4567;
+    int32x4_t vacc1x89AB = vacc0x89AB;
+    int32x4_t vacc1xCDEF = vacc0xCDEF;
+    int32x4_t vacc2x0123 = vacc0x0123;
+    int32x4_t vacc2x4567 = vacc0x4567;
+    int32x4_t vacc2x89AB = vacc0x89AB;
+    int32x4_t vacc2xCDEF = vacc0xCDEF;
+    int32x4_t vacc3x0123 = vacc0x0123;
+    int32x4_t vacc3x4567 = vacc0x4567;
+    int32x4_t vacc3x89AB = vacc0x89AB;
+    int32x4_t vacc3xCDEF = vacc0xCDEF;
+
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
+      const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+      const int16x8_t vxa0 = vmovl_s8(va0);
+      const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+      const int16x8_t vxa1 = vmovl_s8(va1);
+      const int8x8_t va2 = vld1_s8(a2); a2 += 8;
+      const int16x8_t vxa2 = vmovl_s8(va2);
+      const int8x8_t va3 = vld1_s8(a3); a3 += 8;
+      const int16x8_t vxa3 = vmovl_s8(va3);
+
+      const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vxb01234567c0 = vmovl_s8(vb01234567c0);
+
+      vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+      vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+      vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
+      vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
+      vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
+      vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
+      vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0);
+      vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0);
+      const int8x8_t vb89ABCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vxb89ABCDEFc0 = vmovl_s8(vb89ABCDEFc0);
+
+      vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
+      vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
+      vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
+      vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
+      vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
+      vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
+      vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
+      vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
+      const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vxb01234567c1 = vmovl_s8(vb01234567c1);
+
+      vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+      vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+      vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
+      vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
+      vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa2), 1);
+      vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa2), 1);
+      vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1);
+      vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1);
+      const int8x8_t vb89ABCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vxb89ABCDEFc1 = vmovl_s8(vb89ABCDEFc1);
+
+      vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1);
+      vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1);
+      vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1);
+      vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1);
+      vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1);
+      vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1);
+      vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1);
+      vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1);
+      const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vxb01234567c2 = vmovl_s8(vb01234567c2);
+
+      vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+      vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+      vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
+      vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
+      vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa2), 2);
+      vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa2), 2);
+      vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2);
+      vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa3), 2);
+      const int8x8_t vb89ABCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vxb89ABCDEFc2 = vmovl_s8(vb89ABCDEFc2);
+
+      vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2);
+      vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2);
+      vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa1), 2);
+      vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa1), 2);
+      vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa2), 2);
+      vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa2), 2);
+      vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2);
+      vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2);
+      const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vxb01234567c3 = vmovl_s8(vb01234567c3);
+
+      vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+      vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+      vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
+      vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
+      vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa2), 3);
+      vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa2), 3);
+      vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa3), 3);
+      vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa3), 3);
+      const int8x8_t vb89ABCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vxb89ABCDEFc3 = vmovl_s8(vb89ABCDEFc3);
+
+      vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3);
+      vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3);
+      vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa1), 3);
+      vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa1), 3);
+      vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa2), 3);
+      vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa2), 3);
+      vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3);
+      vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3);
+
+      const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vxb01234567c4 = vmovl_s8(vb01234567c4);
+
+      vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+      vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+      vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
+      vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
+      vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa2), 0);
+      vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa2), 0);
+      vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa3), 0);
+      vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa3), 0);
+      const int8x8_t vb89ABCDEFc4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vxb89ABCDEFc4 = vmovl_s8(vb89ABCDEFc4);
+
+      vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0);
+      vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0);
+      vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa1), 0);
+      vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa1), 0);
+      vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa2), 0);
+      vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa2), 0);
+      vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0);
+      vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0);
+      const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vxb01234567c5 = vmovl_s8(vb01234567c5);
+
+      vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+      vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+      vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
+      vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
+      vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa2), 1);
+      vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa2), 1);
+      vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa3), 1);
+      vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa3), 1);
+      const int8x8_t vb89ABCDEFc5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vxb89ABCDEFc5 = vmovl_s8(vb89ABCDEFc5);
+
+      vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1);
+      vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1);
+      vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa1), 1);
+      vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa1), 1);
+      vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa2), 1);
+      vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa2), 1);
+      vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1);
+      vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1);
+      const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vxb01234567c6 = vmovl_s8(vb01234567c6);
+
+      vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+      vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+      vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
+      vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
+      vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa2), 2);
+      vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa2), 2);
+      vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa3), 2);
+      vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa3), 2);
+      const int8x8_t vb89ABCDEFc6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vxb89ABCDEFc6 = vmovl_s8(vb89ABCDEFc6);
+
+      vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2);
+      vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2);
+      vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa1), 2);
+      vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa1), 2);
+      vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa2), 2);
+      vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa2), 2);
+      vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2);
+      vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2);
+      const int8x8_t vb01234567c7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vxb01234567c7 = vmovl_s8(vb01234567c7);
+
+      vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa0), 3);
+      vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa0), 3);
+      vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa1), 3);
+      vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa1), 3);
+      vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa2), 3);
+      vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa2), 3);
+      vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa3), 3);
+      vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa3), 3);
+      const int8x8_t vb89ABCDEFc7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vxb89ABCDEFc7 = vmovl_s8(vb89ABCDEFc7);
+
+      vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa0), 3);
+      vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa0), 3);
+      vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa1), 3);
+      vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa1), 3);
+      vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa2), 3);
+      vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa2), 3);
+      vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa3), 3);
+      vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa3), 3);
+
+      k -= 8 * sizeof(int8_t);
+    }
+    if XNN_UNLIKELY(k != 0) {
+      const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+      const int16x8_t vxa0 = vmovl_s8(va0);
+      const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
+      const int16x8_t vxa1 = vmovl_s8(va1);
+      const int8x8_t va2 = vld1_s8(a2); a2 = (const int8_t*) ((uintptr_t) a2 + k);
+      const int16x8_t vxa2 = vmovl_s8(va2);
+      const int8x8_t va3 = vld1_s8(a3); a3 = (const int8_t*) ((uintptr_t) a3 + k);
+      const int16x8_t vxa3 = vmovl_s8(va3);
+
+      const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vxb01234567c0 = vmovl_s8(vb01234567c0);
+      const int8x8_t vb89ABCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vxb89ABCDEFc0 = vmovl_s8(vb89ABCDEFc0);
+
+      vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+      vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+      vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
+      vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
+      vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
+      vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
+      vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
+      vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
+      vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
+      vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
+      vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
+      vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
+      vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0);
+      vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0);
+      vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
+      vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
+
+      if (k >= 2 * sizeof(int8_t)) {
+        const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb01234567c1 = vmovl_s8(vb01234567c1);
+        const int8x8_t vb89ABCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb89ABCDEFc1 = vmovl_s8(vb89ABCDEFc1);
+
+        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+        vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1);
+        vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1);
+        vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
+        vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
+        vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1);
+        vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1);
+        vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa2), 1);
+        vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa2), 1);
+        vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1);
+        vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1);
+        vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1);
+        vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1);
+        vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1);
+        vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1);
+
+        if (k > 2 * sizeof(int8_t)) {
+          const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int16x8_t vxb01234567c2 = vmovl_s8(vb01234567c2);
+          const int8x8_t vb89ABCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int16x8_t vxb89ABCDEFc2 = vmovl_s8(vb89ABCDEFc2);
+
+          vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+          vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+          vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2);
+          vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2);
+          vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
+          vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
+          vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa1), 2);
+          vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa1), 2);
+          vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa2), 2);
+          vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa2), 2);
+          vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa2), 2);
+          vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa2), 2);
+          vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2);
+          vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa3), 2);
+          vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2);
+          vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2);
+
+          if (k >= 4 * sizeof(int8_t)) {
+            const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int16x8_t vxb01234567c3 = vmovl_s8(vb01234567c3);
+            const int8x8_t vb89ABCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int16x8_t vxb89ABCDEFc3 = vmovl_s8(vb89ABCDEFc3);
+
+            vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+            vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+            vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3);
+            vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3);
+            vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
+            vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
+            vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa1), 3);
+            vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa1), 3);
+            vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa2), 3);
+            vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa2), 3);
+            vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa2), 3);
+            vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa2), 3);
+            vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa3), 3);
+            vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa3), 3);
+            vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3);
+            vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3);
+
+            if (k > 4 * sizeof(int8_t)) {
+              const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+              const int16x8_t vxb01234567c4 = vmovl_s8(vb01234567c4);
+              const int8x8_t vb89ABCDEFc4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+              const int16x8_t vxb89ABCDEFc4 = vmovl_s8(vb89ABCDEFc4);
+
+              vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+              vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+              vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0);
+              vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0);
+              vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
+              vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
+              vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa1), 0);
+              vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa1), 0);
+              vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa2), 0);
+              vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa2), 0);
+              vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa2), 0);
+              vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa2), 0);
+              vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa3), 0);
+              vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa3), 0);
+              vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0);
+              vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0);
+
+              if (k >= 6 * sizeof(int8_t)) {
+                const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+                const int16x8_t vxb01234567c5 = vmovl_s8(vb01234567c5);
+                const int8x8_t vb89ABCDEFc5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+                const int16x8_t vxb89ABCDEFc5 = vmovl_s8(vb89ABCDEFc5);
+
+                vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+                vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+                vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1);
+                vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1);
+                vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
+                vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
+                vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa1), 1);
+                vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa1), 1);
+                vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa2), 1);
+                vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa2), 1);
+                vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa2), 1);
+                vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa2), 1);
+                vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa3), 1);
+                vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa3), 1);
+                vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1);
+                vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1);
+
+                if (k > 6 * sizeof(int8_t)) {
+                  const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+                  const int16x8_t vxb01234567c6 = vmovl_s8(vb01234567c6);
+                  const int8x8_t vb89ABCDEFc6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+                  const int16x8_t vxb89ABCDEFc6 = vmovl_s8(vb89ABCDEFc6);
+
+                  vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+                  vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+                  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2);
+                  vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2);
+                  vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
+                  vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
+                  vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa1), 2);
+                  vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa1), 2);
+                  vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa2), 2);
+                  vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa2), 2);
+                  vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa2), 2);
+                  vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa2), 2);
+                  vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa3), 2);
+                  vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa3), 2);
+                  vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2);
+                  vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+    vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc1x89AB = vqrdmulhq_s32(vacc1x89AB, vmultiplier);
+    vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier);
+    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+    vacc2x89AB = vqrdmulhq_s32(vacc2x89AB, vmultiplier);
+    vacc2xCDEF = vqrdmulhq_s32(vacc2xCDEF, vmultiplier);
+    vacc3x0123 = vqrdmulhq_s32(vacc3x0123, vmultiplier);
+    vacc3x4567 = vqrdmulhq_s32(vacc3x4567, vmultiplier);
+    vacc3x89AB = vqrdmulhq_s32(vacc3x89AB, vmultiplier);
+    vacc3xCDEF = vqrdmulhq_s32(vacc3xCDEF, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+    vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc1x89AB = vsraq_n_s32(vacc1x89AB, vbicq_s32(vacc1x89AB, vzero_shift_mask), 31);
+    vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31);
+    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+    vacc2x89AB = vsraq_n_s32(vacc2x89AB, vbicq_s32(vacc2x89AB, vzero_shift_mask), 31);
+    vacc2xCDEF = vsraq_n_s32(vacc2xCDEF, vbicq_s32(vacc2xCDEF, vzero_shift_mask), 31);
+    vacc3x0123 = vsraq_n_s32(vacc3x0123, vbicq_s32(vacc3x0123, vzero_shift_mask), 31);
+    vacc3x4567 = vsraq_n_s32(vacc3x4567, vbicq_s32(vacc3x4567, vzero_shift_mask), 31);
+    vacc3x89AB = vsraq_n_s32(vacc3x89AB, vbicq_s32(vacc3x89AB, vzero_shift_mask), 31);
+    vacc3xCDEF = vsraq_n_s32(vacc3xCDEF, vbicq_s32(vacc3xCDEF, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc1x89AB = vrshlq_s32(vacc1x89AB, vright_shift);
+    vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_shift);
+    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+    vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_shift);
+    vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_shift);
+    vacc3x0123 = vrshlq_s32(vacc3x0123, vright_shift);
+    vacc3x4567 = vrshlq_s32(vacc3x4567, vright_shift);
+    vacc3x89AB = vrshlq_s32(vacc3x89AB, vright_shift);
+    vacc3xCDEF = vrshlq_s32(vacc3xCDEF, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+    const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), voutput_zero_point);
+    const int16x8_t vacc3x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567), voutput_zero_point);
+    const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+    int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
+    int8x16_t vout2x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc2x89ABCDEF);
+    int8x16_t vout3x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc3x01234567), vacc3x89ABCDEF);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+    const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)), voutput_zero_point);
+    const int16x8_t vacc3x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x0123), vqmovn_s32(vacc3x4567)), voutput_zero_point);
+    const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3xCDEF)), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+    int8x16_t vout1x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc1x01234567), vqmovn_s16(vacc1x89ABCDEF));
+    int8x16_t vout2x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc2x89ABCDEF));
+    int8x16_t vout3x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc3x01234567), vqmovn_s16(vacc3x89ABCDEF));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+    vout1x0123456789ABCDEF = vmaxq_s8(vout1x0123456789ABCDEF, voutput_min);
+    vout2x0123456789ABCDEF = vmaxq_s8(vout2x0123456789ABCDEF, voutput_min);
+    vout3x0123456789ABCDEF = vmaxq_s8(vout3x0123456789ABCDEF, voutput_min);
+
+    vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+    vout1x0123456789ABCDEF = vminq_s8(vout1x0123456789ABCDEF, voutput_max);
+    vout2x0123456789ABCDEF = vminq_s8(vout2x0123456789ABCDEF, voutput_max);
+    vout3x0123456789ABCDEF = vminq_s8(vout3x0123456789ABCDEF, voutput_max);
+
+    if (nc >= 16) {
+      vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+      vst1q_s8(c1 + 0, vout1x0123456789ABCDEF);
+      vst1q_s8(c2 + 0, vout2x0123456789ABCDEF);
+      vst1q_s8(c3 + 0, vout3x0123456789ABCDEF);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+
+      nc -= 16;
+    } else {
+      int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8(vout1x0123456789ABCDEF));
+      int8x16_t vout2x01234567_3x01234567 = vcombine_s8(vget_low_s8(vout2x0123456789ABCDEF), vget_low_s8(vout3x0123456789ABCDEF));
+      if (nc & 8) {
+        vst1_s8(c0, vget_low_s8(vout0x01234567_1x01234567)); c0 += 8;
+        vst1_s8(c1, vget_high_s8(vout0x01234567_1x01234567)); c1 += 8;
+        vst1_s8(c2, vget_low_s8(vout2x01234567_3x01234567)); c2 += 8;
+        vst1_s8(c3, vget_high_s8(vout2x01234567_3x01234567)); c3 += 8;
+        vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0123456789ABCDEF));
+        vout2x01234567_3x01234567 = vcombine_s8(vget_high_s8(vout2x0123456789ABCDEF), vget_high_s8(vout3x0123456789ABCDEF));
+      }
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 0); c2 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c3, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 2); c3 += 4;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 0); c2 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c3, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 4); c3 += 2;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c2, vout2x01234567_3x01234567, 0);
+        vst1q_lane_s8(c3, vout2x01234567_3x01234567, 8);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-gemm/gen/4x16-minmax-neon-mull-addw-dup.c b/src/qs8-gemm/gen/4x16-minmax-neon-mull-addw-dup.c
new file mode 100644
index 0000000..cd1d411
--- /dev/null
+++ b/src/qs8-gemm/gen/4x16-minmax-neon-mull-addw-dup.c
@@ -0,0 +1,677 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/neon-mull-addw-dup.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/gemm.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+  const int8_t* a3 = (const int8_t*) ((uintptr_t) a2 + a_stride);
+  int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    a3 = a2;
+    c3 = c2;
+  }
+
+  do {
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc1x0123 = vacc0x0123;
+    int32x4_t vacc1x4567 = vacc0x4567;
+    int32x4_t vacc1x89AB = vacc0x89AB;
+    int32x4_t vacc1xCDEF = vacc0xCDEF;
+    int32x4_t vacc2x0123 = vacc0x0123;
+    int32x4_t vacc2x4567 = vacc0x4567;
+    int32x4_t vacc2x89AB = vacc0x89AB;
+    int32x4_t vacc2xCDEF = vacc0xCDEF;
+    int32x4_t vacc3x0123 = vacc0x0123;
+    int32x4_t vacc3x4567 = vacc0x4567;
+    int32x4_t vacc3x89AB = vacc0x89AB;
+    int32x4_t vacc3xCDEF = vacc0xCDEF;
+
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
+      const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+      const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+      const int8x8_t va2 = vld1_s8(a2); a2 += 8;
+      const int8x8_t va3 = vld1_s8(a3); a3 += 8;
+
+      const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va0, 0));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c0));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c0));
+      const int16x8_t vprod1x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va1, 0));
+      vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c0));
+      vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c0));
+      const int16x8_t vprod2x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va2, 0));
+      vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c0));
+      vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c0));
+      const int16x8_t vprod3x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va3, 0));
+      vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c0));
+      vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c0));
+      const int8x8_t vb89ABCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va0, 0));
+      vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc0));
+      vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc0));
+      const int16x8_t vprod1x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va1, 0));
+      vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc0));
+      vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc0));
+      const int16x8_t vprod2x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va2, 0));
+      vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc0));
+      vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc0));
+      const int16x8_t vprod3x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va3, 0));
+      vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc0));
+      vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc0));
+      const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va0, 1));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c1));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c1));
+      const int16x8_t vprod1x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va1, 1));
+      vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c1));
+      vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c1));
+      const int16x8_t vprod2x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va2, 1));
+      vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c1));
+      vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c1));
+      const int16x8_t vprod3x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va3, 1));
+      vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c1));
+      vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c1));
+      const int8x8_t vb89ABCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va0, 1));
+      vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc1));
+      vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc1));
+      const int16x8_t vprod1x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va1, 1));
+      vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc1));
+      vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc1));
+      const int16x8_t vprod2x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va2, 1));
+      vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc1));
+      vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc1));
+      const int16x8_t vprod3x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va3, 1));
+      vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc1));
+      vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc1));
+      const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va0, 2));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c2));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c2));
+      const int16x8_t vprod1x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va1, 2));
+      vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c2));
+      vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c2));
+      const int16x8_t vprod2x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va2, 2));
+      vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c2));
+      vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c2));
+      const int16x8_t vprod3x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va3, 2));
+      vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c2));
+      vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c2));
+      const int8x8_t vb89ABCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va0, 2));
+      vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc2));
+      vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc2));
+      const int16x8_t vprod1x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va1, 2));
+      vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc2));
+      vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc2));
+      const int16x8_t vprod2x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va2, 2));
+      vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc2));
+      vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc2));
+      const int16x8_t vprod3x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va3, 2));
+      vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc2));
+      vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc2));
+      const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));
+      const int16x8_t vprod1x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va1, 3));
+      vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c3));
+      vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c3));
+      const int16x8_t vprod2x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va2, 3));
+      vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c3));
+      vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c3));
+      const int16x8_t vprod3x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va3, 3));
+      vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c3));
+      vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c3));
+      const int8x8_t vb89ABCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va0, 3));
+      vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc3));
+      vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc3));
+      const int16x8_t vprod1x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va1, 3));
+      vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc3));
+      vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc3));
+      const int16x8_t vprod2x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va2, 3));
+      vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc3));
+      vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc3));
+      const int16x8_t vprod3x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va3, 3));
+      vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc3));
+      vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc3));
+      const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va0, 4));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c4));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c4));
+      const int16x8_t vprod1x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va1, 4));
+      vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c4));
+      vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c4));
+      const int16x8_t vprod2x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va2, 4));
+      vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c4));
+      vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c4));
+      const int16x8_t vprod3x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va3, 4));
+      vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c4));
+      vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c4));
+      const int8x8_t vb89ABCDEFc4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va0, 4));
+      vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc4));
+      vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc4));
+      const int16x8_t vprod1x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va1, 4));
+      vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc4));
+      vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc4));
+      const int16x8_t vprod2x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va2, 4));
+      vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc4));
+      vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc4));
+      const int16x8_t vprod3x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va3, 4));
+      vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc4));
+      vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc4));
+      const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va0, 5));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c5));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c5));
+      const int16x8_t vprod1x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va1, 5));
+      vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c5));
+      vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c5));
+      const int16x8_t vprod2x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va2, 5));
+      vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c5));
+      vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c5));
+      const int16x8_t vprod3x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va3, 5));
+      vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c5));
+      vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c5));
+      const int8x8_t vb89ABCDEFc5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va0, 5));
+      vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc5));
+      vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc5));
+      const int16x8_t vprod1x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va1, 5));
+      vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc5));
+      vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc5));
+      const int16x8_t vprod2x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va2, 5));
+      vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc5));
+      vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc5));
+      const int16x8_t vprod3x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va3, 5));
+      vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc5));
+      vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc5));
+      const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va0, 6));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c6));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c6));
+      const int16x8_t vprod1x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va1, 6));
+      vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c6));
+      vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c6));
+      const int16x8_t vprod2x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va2, 6));
+      vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c6));
+      vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c6));
+      const int16x8_t vprod3x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va3, 6));
+      vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c6));
+      vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c6));
+      const int8x8_t vb89ABCDEFc6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va0, 6));
+      vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc6));
+      vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc6));
+      const int16x8_t vprod1x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va1, 6));
+      vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc6));
+      vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc6));
+      const int16x8_t vprod2x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va2, 6));
+      vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc6));
+      vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc6));
+      const int16x8_t vprod3x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va3, 6));
+      vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc6));
+      vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc6));
+      const int8x8_t vb01234567c7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c7 = vmull_s8(vb01234567c7, vdup_lane_s8(va0, 7));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c7));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c7));
+      const int16x8_t vprod1x01234567c7 = vmull_s8(vb01234567c7, vdup_lane_s8(va1, 7));
+      vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c7));
+      vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c7));
+      const int16x8_t vprod2x01234567c7 = vmull_s8(vb01234567c7, vdup_lane_s8(va2, 7));
+      vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c7));
+      vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c7));
+      const int16x8_t vprod3x01234567c7 = vmull_s8(vb01234567c7, vdup_lane_s8(va3, 7));
+      vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c7));
+      vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c7));
+      const int8x8_t vb89ABCDEFc7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x89ABCDEFc7 = vmull_s8(vb89ABCDEFc7, vdup_lane_s8(va0, 7));
+      vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc7));
+      vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc7));
+      const int16x8_t vprod1x89ABCDEFc7 = vmull_s8(vb89ABCDEFc7, vdup_lane_s8(va1, 7));
+      vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc7));
+      vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc7));
+      const int16x8_t vprod2x89ABCDEFc7 = vmull_s8(vb89ABCDEFc7, vdup_lane_s8(va2, 7));
+      vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc7));
+      vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc7));
+      const int16x8_t vprod3x89ABCDEFc7 = vmull_s8(vb89ABCDEFc7, vdup_lane_s8(va3, 7));
+      vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc7));
+      vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc7));
+
+      k -= 8 * sizeof(int8_t);
+    }
+    if XNN_UNLIKELY(k != 0) {
+      const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+      const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
+      const int8x8_t va2 = vld1_s8(a2); a2 = (const int8_t*) ((uintptr_t) a2 + k);
+      const int8x8_t va3 = vld1_s8(a3); a3 = (const int8_t*) ((uintptr_t) a3 + k);
+
+      const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va0, 0));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c0));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c0));
+      const int16x8_t vprod0x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va0, 0));
+      vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc0));
+      vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc0));
+      const int16x8_t vprod1x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va1, 0));
+      vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c0));
+      vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c0));
+      const int16x8_t vprod1x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va1, 0));
+      vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc0));
+      vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc0));
+      const int16x8_t vprod2x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va2, 0));
+      vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c0));
+      vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c0));
+      const int16x8_t vprod2x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va2, 0));
+      vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc0));
+      vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc0));
+      const int16x8_t vprod3x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va3, 0));
+      vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c0));
+      vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c0));
+      const int16x8_t vprod3x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va3, 0));
+      vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc0));
+      vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc0));
+
+      if (k >= 2 * sizeof(int8_t)) {
+        const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va0, 1));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c1));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c1));
+        const int16x8_t vprod0x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va0, 1));
+        vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc1));
+        vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc1));
+        const int16x8_t vprod1x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va1, 1));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c1));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c1));
+        const int16x8_t vprod1x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va1, 1));
+        vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc1));
+        vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc1));
+        const int16x8_t vprod2x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va2, 1));
+        vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c1));
+        vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c1));
+        const int16x8_t vprod2x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va2, 1));
+        vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc1));
+        vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc1));
+        const int16x8_t vprod3x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va3, 1));
+        vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c1));
+        vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c1));
+        const int16x8_t vprod3x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va3, 1));
+        vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc1));
+        vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc1));
+
+        if (k > 2 * sizeof(int8_t)) {
+          const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb89ABCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          const int16x8_t vprod0x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va0, 2));
+          vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c2));
+          vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c2));
+          const int16x8_t vprod0x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va0, 2));
+          vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc2));
+          vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc2));
+          const int16x8_t vprod1x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va1, 2));
+          vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c2));
+          vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c2));
+          const int16x8_t vprod1x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va1, 2));
+          vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc2));
+          vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc2));
+          const int16x8_t vprod2x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va2, 2));
+          vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c2));
+          vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c2));
+          const int16x8_t vprod2x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va2, 2));
+          vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc2));
+          vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc2));
+          const int16x8_t vprod3x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va3, 2));
+          vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c2));
+          vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c2));
+          const int16x8_t vprod3x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va3, 2));
+          vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc2));
+          vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc2));
+
+          if (k >= 4 * sizeof(int8_t)) {
+            const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int8x8_t vb89ABCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+            const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));
+            vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
+            vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));
+            const int16x8_t vprod0x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va0, 3));
+            vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc3));
+            vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc3));
+            const int16x8_t vprod1x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va1, 3));
+            vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c3));
+            vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c3));
+            const int16x8_t vprod1x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va1, 3));
+            vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc3));
+            vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc3));
+            const int16x8_t vprod2x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va2, 3));
+            vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c3));
+            vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c3));
+            const int16x8_t vprod2x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va2, 3));
+            vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc3));
+            vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc3));
+            const int16x8_t vprod3x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va3, 3));
+            vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c3));
+            vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c3));
+            const int16x8_t vprod3x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va3, 3));
+            vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc3));
+            vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc3));
+
+            if (k > 4 * sizeof(int8_t)) {
+              const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+              const int8x8_t vb89ABCDEFc4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+              const int16x8_t vprod0x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va0, 4));
+              vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c4));
+              vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c4));
+              const int16x8_t vprod0x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va0, 4));
+              vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc4));
+              vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc4));
+              const int16x8_t vprod1x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va1, 4));
+              vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c4));
+              vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c4));
+              const int16x8_t vprod1x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va1, 4));
+              vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc4));
+              vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc4));
+              const int16x8_t vprod2x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va2, 4));
+              vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c4));
+              vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c4));
+              const int16x8_t vprod2x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va2, 4));
+              vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc4));
+              vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc4));
+              const int16x8_t vprod3x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va3, 4));
+              vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c4));
+              vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c4));
+              const int16x8_t vprod3x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va3, 4));
+              vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc4));
+              vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc4));
+
+              if (k >= 6 * sizeof(int8_t)) {
+                const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+                const int8x8_t vb89ABCDEFc5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+                const int16x8_t vprod0x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va0, 5));
+                vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c5));
+                vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c5));
+                const int16x8_t vprod0x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va0, 5));
+                vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc5));
+                vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc5));
+                const int16x8_t vprod1x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va1, 5));
+                vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c5));
+                vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c5));
+                const int16x8_t vprod1x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va1, 5));
+                vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc5));
+                vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc5));
+                const int16x8_t vprod2x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va2, 5));
+                vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c5));
+                vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c5));
+                const int16x8_t vprod2x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va2, 5));
+                vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc5));
+                vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc5));
+                const int16x8_t vprod3x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va3, 5));
+                vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c5));
+                vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c5));
+                const int16x8_t vprod3x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va3, 5));
+                vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc5));
+                vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc5));
+
+                if (k > 6 * sizeof(int8_t)) {
+                  const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+                  const int8x8_t vb89ABCDEFc6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+                  const int16x8_t vprod0x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va0, 6));
+                  vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c6));
+                  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c6));
+                  const int16x8_t vprod0x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va0, 6));
+                  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc6));
+                  vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc6));
+                  const int16x8_t vprod1x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va1, 6));
+                  vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c6));
+                  vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c6));
+                  const int16x8_t vprod1x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va1, 6));
+                  vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc6));
+                  vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc6));
+                  const int16x8_t vprod2x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va2, 6));
+                  vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c6));
+                  vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c6));
+                  const int16x8_t vprod2x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va2, 6));
+                  vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc6));
+                  vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc6));
+                  const int16x8_t vprod3x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va3, 6));
+                  vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c6));
+                  vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c6));
+                  const int16x8_t vprod3x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va3, 6));
+                  vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc6));
+                  vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc6));
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+    vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc1x89AB = vqrdmulhq_s32(vacc1x89AB, vmultiplier);
+    vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier);
+    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+    vacc2x89AB = vqrdmulhq_s32(vacc2x89AB, vmultiplier);
+    vacc2xCDEF = vqrdmulhq_s32(vacc2xCDEF, vmultiplier);
+    vacc3x0123 = vqrdmulhq_s32(vacc3x0123, vmultiplier);
+    vacc3x4567 = vqrdmulhq_s32(vacc3x4567, vmultiplier);
+    vacc3x89AB = vqrdmulhq_s32(vacc3x89AB, vmultiplier);
+    vacc3xCDEF = vqrdmulhq_s32(vacc3xCDEF, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+    vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc1x89AB = vsraq_n_s32(vacc1x89AB, vbicq_s32(vacc1x89AB, vzero_shift_mask), 31);
+    vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31);
+    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+    vacc2x89AB = vsraq_n_s32(vacc2x89AB, vbicq_s32(vacc2x89AB, vzero_shift_mask), 31);
+    vacc2xCDEF = vsraq_n_s32(vacc2xCDEF, vbicq_s32(vacc2xCDEF, vzero_shift_mask), 31);
+    vacc3x0123 = vsraq_n_s32(vacc3x0123, vbicq_s32(vacc3x0123, vzero_shift_mask), 31);
+    vacc3x4567 = vsraq_n_s32(vacc3x4567, vbicq_s32(vacc3x4567, vzero_shift_mask), 31);
+    vacc3x89AB = vsraq_n_s32(vacc3x89AB, vbicq_s32(vacc3x89AB, vzero_shift_mask), 31);
+    vacc3xCDEF = vsraq_n_s32(vacc3xCDEF, vbicq_s32(vacc3xCDEF, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc1x89AB = vrshlq_s32(vacc1x89AB, vright_shift);
+    vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_shift);
+    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+    vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_shift);
+    vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_shift);
+    vacc3x0123 = vrshlq_s32(vacc3x0123, vright_shift);
+    vacc3x4567 = vrshlq_s32(vacc3x4567, vright_shift);
+    vacc3x89AB = vrshlq_s32(vacc3x89AB, vright_shift);
+    vacc3xCDEF = vrshlq_s32(vacc3xCDEF, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+    const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), voutput_zero_point);
+    const int16x8_t vacc3x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567), voutput_zero_point);
+    const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+    int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
+    int8x16_t vout2x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc2x89ABCDEF);
+    int8x16_t vout3x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc3x01234567), vacc3x89ABCDEF);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+    const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)), voutput_zero_point);
+    const int16x8_t vacc3x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x0123), vqmovn_s32(vacc3x4567)), voutput_zero_point);
+    const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3xCDEF)), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+    int8x16_t vout1x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc1x01234567), vqmovn_s16(vacc1x89ABCDEF));
+    int8x16_t vout2x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc2x89ABCDEF));
+    int8x16_t vout3x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc3x01234567), vqmovn_s16(vacc3x89ABCDEF));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+    vout1x0123456789ABCDEF = vmaxq_s8(vout1x0123456789ABCDEF, voutput_min);
+    vout2x0123456789ABCDEF = vmaxq_s8(vout2x0123456789ABCDEF, voutput_min);
+    vout3x0123456789ABCDEF = vmaxq_s8(vout3x0123456789ABCDEF, voutput_min);
+
+    vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+    vout1x0123456789ABCDEF = vminq_s8(vout1x0123456789ABCDEF, voutput_max);
+    vout2x0123456789ABCDEF = vminq_s8(vout2x0123456789ABCDEF, voutput_max);
+    vout3x0123456789ABCDEF = vminq_s8(vout3x0123456789ABCDEF, voutput_max);
+
+    if (nc >= 16) {
+      vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+      vst1q_s8(c1 + 0, vout1x0123456789ABCDEF);
+      vst1q_s8(c2 + 0, vout2x0123456789ABCDEF);
+      vst1q_s8(c3 + 0, vout3x0123456789ABCDEF);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+
+      nc -= 16;
+    } else {
+      int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8(vout1x0123456789ABCDEF));
+      int8x16_t vout2x01234567_3x01234567 = vcombine_s8(vget_low_s8(vout2x0123456789ABCDEF), vget_low_s8(vout3x0123456789ABCDEF));
+      if (nc & 8) {
+        vst1_s8(c0, vget_low_s8(vout0x01234567_1x01234567)); c0 += 8;
+        vst1_s8(c1, vget_high_s8(vout0x01234567_1x01234567)); c1 += 8;
+        vst1_s8(c2, vget_low_s8(vout2x01234567_3x01234567)); c2 += 8;
+        vst1_s8(c3, vget_high_s8(vout2x01234567_3x01234567)); c3 += 8;
+        vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0123456789ABCDEF));
+        vout2x01234567_3x01234567 = vcombine_s8(vget_high_s8(vout2x0123456789ABCDEF), vget_high_s8(vout3x0123456789ABCDEF));
+      }
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 0); c2 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c3, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 2); c3 += 4;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 0); c2 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c3, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 4); c3 += 2;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c2, vout2x01234567_3x01234567, 0);
+        vst1q_lane_s8(c3, vout2x01234567_3x01234567, 8);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-gemm/gen/4x16c16-minmax-neon-mlal-padal.c b/src/qs8-gemm/gen/4x16c16-minmax-neon-mlal-padal.c
new file mode 100644
index 0000000..53cbee3
--- /dev/null
+++ b/src/qs8-gemm/gen/4x16c16-minmax-neon-mlal-padal.c
@@ -0,0 +1,662 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/c16-neon-mlal-padal.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 16);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+  const int8_t* a3 = (const int8_t*) ((uintptr_t) a2 + a_stride);
+  int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    a3 = a2;
+    c3 = c2;
+  }
+
+  do {
+    int32x4_t vacc0x0 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x1 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x2 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x3 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x4 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x5 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x6 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x7 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x8 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x9 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x10 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x11 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x12 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x13 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x14 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x15 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc1x0 = vacc0x0;
+    int32x4_t vacc1x1 = vacc0x1;
+    int32x4_t vacc1x2 = vacc0x2;
+    int32x4_t vacc1x3 = vacc0x3;
+    int32x4_t vacc1x4 = vacc0x4;
+    int32x4_t vacc1x5 = vacc0x5;
+    int32x4_t vacc1x6 = vacc0x6;
+    int32x4_t vacc1x7 = vacc0x7;
+    int32x4_t vacc1x8 = vacc0x8;
+    int32x4_t vacc1x9 = vacc0x9;
+    int32x4_t vacc1x10 = vacc0x10;
+    int32x4_t vacc1x11 = vacc0x11;
+    int32x4_t vacc1x12 = vacc0x12;
+    int32x4_t vacc1x13 = vacc0x13;
+    int32x4_t vacc1x14 = vacc0x14;
+    int32x4_t vacc1x15 = vacc0x15;
+    int32x4_t vacc2x0 = vacc0x0;
+    int32x4_t vacc2x1 = vacc0x1;
+    int32x4_t vacc2x2 = vacc0x2;
+    int32x4_t vacc2x3 = vacc0x3;
+    int32x4_t vacc2x4 = vacc0x4;
+    int32x4_t vacc2x5 = vacc0x5;
+    int32x4_t vacc2x6 = vacc0x6;
+    int32x4_t vacc2x7 = vacc0x7;
+    int32x4_t vacc2x8 = vacc0x8;
+    int32x4_t vacc2x9 = vacc0x9;
+    int32x4_t vacc2x10 = vacc0x10;
+    int32x4_t vacc2x11 = vacc0x11;
+    int32x4_t vacc2x12 = vacc0x12;
+    int32x4_t vacc2x13 = vacc0x13;
+    int32x4_t vacc2x14 = vacc0x14;
+    int32x4_t vacc2x15 = vacc0x15;
+    int32x4_t vacc3x0 = vacc0x0;
+    int32x4_t vacc3x1 = vacc0x1;
+    int32x4_t vacc3x2 = vacc0x2;
+    int32x4_t vacc3x3 = vacc0x3;
+    int32x4_t vacc3x4 = vacc0x4;
+    int32x4_t vacc3x5 = vacc0x5;
+    int32x4_t vacc3x6 = vacc0x6;
+    int32x4_t vacc3x7 = vacc0x7;
+    int32x4_t vacc3x8 = vacc0x8;
+    int32x4_t vacc3x9 = vacc0x9;
+    int32x4_t vacc3x10 = vacc0x10;
+    int32x4_t vacc3x11 = vacc0x11;
+    int32x4_t vacc3x12 = vacc0x12;
+    int32x4_t vacc3x13 = vacc0x13;
+    int32x4_t vacc3x14 = vacc0x14;
+    int32x4_t vacc3x15 = vacc0x15;
+
+    // KC loop of 16
+    size_t k = 0;
+    while (k < kc) {
+      const int8x16_t va0 = vld1q_s8(a0); a0 += 16;
+      const int8x16_t va1 = vld1q_s8(a1); a1 += 16;
+      const int8x16_t va2 = vld1q_s8(a2); a2 += 16;
+      const int8x16_t va3 = vld1q_s8(a3); a3 += 16;
+
+      const int8x16_t vb0 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb1 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb2 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb3 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb4 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb5 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb6 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb7 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb8 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb9 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb10 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb11 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb12 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb13 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb14 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb15 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+
+      int16x8_t vprod0x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va0));
+      int16x8_t vprod1x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va1));
+      int16x8_t vprod2x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va2));
+      int16x8_t vprod3x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va3));
+      vprod0x0 = vmlal_s8(vprod0x0, vget_high_s8(vb0), vget_high_s8(va0));
+      vprod1x0 = vmlal_s8(vprod1x0, vget_high_s8(vb0), vget_high_s8(va1));
+      vprod2x0 = vmlal_s8(vprod2x0, vget_high_s8(vb0), vget_high_s8(va2));
+      vprod3x0 = vmlal_s8(vprod3x0, vget_high_s8(vb0), vget_high_s8(va3));
+      vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+      vacc1x0 = vpadalq_s16(vacc1x0, vprod1x0);
+      vacc2x0 = vpadalq_s16(vacc2x0, vprod2x0);
+      vacc3x0 = vpadalq_s16(vacc3x0, vprod3x0);
+      int16x8_t vprod0x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va0));
+      int16x8_t vprod1x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va1));
+      int16x8_t vprod2x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va2));
+      int16x8_t vprod3x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va3));
+      vprod0x1 = vmlal_s8(vprod0x1, vget_high_s8(vb1), vget_high_s8(va0));
+      vprod1x1 = vmlal_s8(vprod1x1, vget_high_s8(vb1), vget_high_s8(va1));
+      vprod2x1 = vmlal_s8(vprod2x1, vget_high_s8(vb1), vget_high_s8(va2));
+      vprod3x1 = vmlal_s8(vprod3x1, vget_high_s8(vb1), vget_high_s8(va3));
+      vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+      vacc1x1 = vpadalq_s16(vacc1x1, vprod1x1);
+      vacc2x1 = vpadalq_s16(vacc2x1, vprod2x1);
+      vacc3x1 = vpadalq_s16(vacc3x1, vprod3x1);
+      int16x8_t vprod0x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va0));
+      int16x8_t vprod1x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va1));
+      int16x8_t vprod2x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va2));
+      int16x8_t vprod3x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va3));
+      vprod0x2 = vmlal_s8(vprod0x2, vget_high_s8(vb2), vget_high_s8(va0));
+      vprod1x2 = vmlal_s8(vprod1x2, vget_high_s8(vb2), vget_high_s8(va1));
+      vprod2x2 = vmlal_s8(vprod2x2, vget_high_s8(vb2), vget_high_s8(va2));
+      vprod3x2 = vmlal_s8(vprod3x2, vget_high_s8(vb2), vget_high_s8(va3));
+      vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+      vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);
+      vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
+      vacc3x2 = vpadalq_s16(vacc3x2, vprod3x2);
+      int16x8_t vprod0x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va0));
+      int16x8_t vprod1x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va1));
+      int16x8_t vprod2x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va2));
+      int16x8_t vprod3x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va3));
+      vprod0x3 = vmlal_s8(vprod0x3, vget_high_s8(vb3), vget_high_s8(va0));
+      vprod1x3 = vmlal_s8(vprod1x3, vget_high_s8(vb3), vget_high_s8(va1));
+      vprod2x3 = vmlal_s8(vprod2x3, vget_high_s8(vb3), vget_high_s8(va2));
+      vprod3x3 = vmlal_s8(vprod3x3, vget_high_s8(vb3), vget_high_s8(va3));
+      vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+      vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);
+      vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
+      vacc3x3 = vpadalq_s16(vacc3x3, vprod3x3);
+      int16x8_t vprod0x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va0));
+      int16x8_t vprod1x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va1));
+      int16x8_t vprod2x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va2));
+      int16x8_t vprod3x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va3));
+      vprod0x4 = vmlal_s8(vprod0x4, vget_high_s8(vb4), vget_high_s8(va0));
+      vprod1x4 = vmlal_s8(vprod1x4, vget_high_s8(vb4), vget_high_s8(va1));
+      vprod2x4 = vmlal_s8(vprod2x4, vget_high_s8(vb4), vget_high_s8(va2));
+      vprod3x4 = vmlal_s8(vprod3x4, vget_high_s8(vb4), vget_high_s8(va3));
+      vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+      vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);
+      vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);
+      vacc3x4 = vpadalq_s16(vacc3x4, vprod3x4);
+      int16x8_t vprod0x5 = vmull_s8(vget_low_s8(vb5), vget_low_s8(va0));
+      int16x8_t vprod1x5 = vmull_s8(vget_low_s8(vb5), vget_low_s8(va1));
+      int16x8_t vprod2x5 = vmull_s8(vget_low_s8(vb5), vget_low_s8(va2));
+      int16x8_t vprod3x5 = vmull_s8(vget_low_s8(vb5), vget_low_s8(va3));
+      vprod0x5 = vmlal_s8(vprod0x5, vget_high_s8(vb5), vget_high_s8(va0));
+      vprod1x5 = vmlal_s8(vprod1x5, vget_high_s8(vb5), vget_high_s8(va1));
+      vprod2x5 = vmlal_s8(vprod2x5, vget_high_s8(vb5), vget_high_s8(va2));
+      vprod3x5 = vmlal_s8(vprod3x5, vget_high_s8(vb5), vget_high_s8(va3));
+      vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+      vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);
+      vacc2x5 = vpadalq_s16(vacc2x5, vprod2x5);
+      vacc3x5 = vpadalq_s16(vacc3x5, vprod3x5);
+      int16x8_t vprod0x6 = vmull_s8(vget_low_s8(vb6), vget_low_s8(va0));
+      int16x8_t vprod1x6 = vmull_s8(vget_low_s8(vb6), vget_low_s8(va1));
+      int16x8_t vprod2x6 = vmull_s8(vget_low_s8(vb6), vget_low_s8(va2));
+      int16x8_t vprod3x6 = vmull_s8(vget_low_s8(vb6), vget_low_s8(va3));
+      vprod0x6 = vmlal_s8(vprod0x6, vget_high_s8(vb6), vget_high_s8(va0));
+      vprod1x6 = vmlal_s8(vprod1x6, vget_high_s8(vb6), vget_high_s8(va1));
+      vprod2x6 = vmlal_s8(vprod2x6, vget_high_s8(vb6), vget_high_s8(va2));
+      vprod3x6 = vmlal_s8(vprod3x6, vget_high_s8(vb6), vget_high_s8(va3));
+      vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+      vacc1x6 = vpadalq_s16(vacc1x6, vprod1x6);
+      vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6);
+      vacc3x6 = vpadalq_s16(vacc3x6, vprod3x6);
+      int16x8_t vprod0x7 = vmull_s8(vget_low_s8(vb7), vget_low_s8(va0));
+      int16x8_t vprod1x7 = vmull_s8(vget_low_s8(vb7), vget_low_s8(va1));
+      int16x8_t vprod2x7 = vmull_s8(vget_low_s8(vb7), vget_low_s8(va2));
+      int16x8_t vprod3x7 = vmull_s8(vget_low_s8(vb7), vget_low_s8(va3));
+      vprod0x7 = vmlal_s8(vprod0x7, vget_high_s8(vb7), vget_high_s8(va0));
+      vprod1x7 = vmlal_s8(vprod1x7, vget_high_s8(vb7), vget_high_s8(va1));
+      vprod2x7 = vmlal_s8(vprod2x7, vget_high_s8(vb7), vget_high_s8(va2));
+      vprod3x7 = vmlal_s8(vprod3x7, vget_high_s8(vb7), vget_high_s8(va3));
+      vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+      vacc1x7 = vpadalq_s16(vacc1x7, vprod1x7);
+      vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7);
+      vacc3x7 = vpadalq_s16(vacc3x7, vprod3x7);
+      int16x8_t vprod0x8 = vmull_s8(vget_low_s8(vb8), vget_low_s8(va0));
+      int16x8_t vprod1x8 = vmull_s8(vget_low_s8(vb8), vget_low_s8(va1));
+      int16x8_t vprod2x8 = vmull_s8(vget_low_s8(vb8), vget_low_s8(va2));
+      int16x8_t vprod3x8 = vmull_s8(vget_low_s8(vb8), vget_low_s8(va3));
+      vprod0x8 = vmlal_s8(vprod0x8, vget_high_s8(vb8), vget_high_s8(va0));
+      vprod1x8 = vmlal_s8(vprod1x8, vget_high_s8(vb8), vget_high_s8(va1));
+      vprod2x8 = vmlal_s8(vprod2x8, vget_high_s8(vb8), vget_high_s8(va2));
+      vprod3x8 = vmlal_s8(vprod3x8, vget_high_s8(vb8), vget_high_s8(va3));
+      vacc0x8 = vpadalq_s16(vacc0x8, vprod0x8);
+      vacc1x8 = vpadalq_s16(vacc1x8, vprod1x8);
+      vacc2x8 = vpadalq_s16(vacc2x8, vprod2x8);
+      vacc3x8 = vpadalq_s16(vacc3x8, vprod3x8);
+      int16x8_t vprod0x9 = vmull_s8(vget_low_s8(vb9), vget_low_s8(va0));
+      int16x8_t vprod1x9 = vmull_s8(vget_low_s8(vb9), vget_low_s8(va1));
+      int16x8_t vprod2x9 = vmull_s8(vget_low_s8(vb9), vget_low_s8(va2));
+      int16x8_t vprod3x9 = vmull_s8(vget_low_s8(vb9), vget_low_s8(va3));
+      vprod0x9 = vmlal_s8(vprod0x9, vget_high_s8(vb9), vget_high_s8(va0));
+      vprod1x9 = vmlal_s8(vprod1x9, vget_high_s8(vb9), vget_high_s8(va1));
+      vprod2x9 = vmlal_s8(vprod2x9, vget_high_s8(vb9), vget_high_s8(va2));
+      vprod3x9 = vmlal_s8(vprod3x9, vget_high_s8(vb9), vget_high_s8(va3));
+      vacc0x9 = vpadalq_s16(vacc0x9, vprod0x9);
+      vacc1x9 = vpadalq_s16(vacc1x9, vprod1x9);
+      vacc2x9 = vpadalq_s16(vacc2x9, vprod2x9);
+      vacc3x9 = vpadalq_s16(vacc3x9, vprod3x9);
+      int16x8_t vprod0x10 = vmull_s8(vget_low_s8(vb10), vget_low_s8(va0));
+      int16x8_t vprod1x10 = vmull_s8(vget_low_s8(vb10), vget_low_s8(va1));
+      int16x8_t vprod2x10 = vmull_s8(vget_low_s8(vb10), vget_low_s8(va2));
+      int16x8_t vprod3x10 = vmull_s8(vget_low_s8(vb10), vget_low_s8(va3));
+      vprod0x10 = vmlal_s8(vprod0x10, vget_high_s8(vb10), vget_high_s8(va0));
+      vprod1x10 = vmlal_s8(vprod1x10, vget_high_s8(vb10), vget_high_s8(va1));
+      vprod2x10 = vmlal_s8(vprod2x10, vget_high_s8(vb10), vget_high_s8(va2));
+      vprod3x10 = vmlal_s8(vprod3x10, vget_high_s8(vb10), vget_high_s8(va3));
+      vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
+      vacc1x10 = vpadalq_s16(vacc1x10, vprod1x10);
+      vacc2x10 = vpadalq_s16(vacc2x10, vprod2x10);
+      vacc3x10 = vpadalq_s16(vacc3x10, vprod3x10);
+      int16x8_t vprod0x11 = vmull_s8(vget_low_s8(vb11), vget_low_s8(va0));
+      int16x8_t vprod1x11 = vmull_s8(vget_low_s8(vb11), vget_low_s8(va1));
+      int16x8_t vprod2x11 = vmull_s8(vget_low_s8(vb11), vget_low_s8(va2));
+      int16x8_t vprod3x11 = vmull_s8(vget_low_s8(vb11), vget_low_s8(va3));
+      vprod0x11 = vmlal_s8(vprod0x11, vget_high_s8(vb11), vget_high_s8(va0));
+      vprod1x11 = vmlal_s8(vprod1x11, vget_high_s8(vb11), vget_high_s8(va1));
+      vprod2x11 = vmlal_s8(vprod2x11, vget_high_s8(vb11), vget_high_s8(va2));
+      vprod3x11 = vmlal_s8(vprod3x11, vget_high_s8(vb11), vget_high_s8(va3));
+      vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11);
+      vacc1x11 = vpadalq_s16(vacc1x11, vprod1x11);
+      vacc2x11 = vpadalq_s16(vacc2x11, vprod2x11);
+      vacc3x11 = vpadalq_s16(vacc3x11, vprod3x11);
+      int16x8_t vprod0x12 = vmull_s8(vget_low_s8(vb12), vget_low_s8(va0));
+      int16x8_t vprod1x12 = vmull_s8(vget_low_s8(vb12), vget_low_s8(va1));
+      int16x8_t vprod2x12 = vmull_s8(vget_low_s8(vb12), vget_low_s8(va2));
+      int16x8_t vprod3x12 = vmull_s8(vget_low_s8(vb12), vget_low_s8(va3));
+      vprod0x12 = vmlal_s8(vprod0x12, vget_high_s8(vb12), vget_high_s8(va0));
+      vprod1x12 = vmlal_s8(vprod1x12, vget_high_s8(vb12), vget_high_s8(va1));
+      vprod2x12 = vmlal_s8(vprod2x12, vget_high_s8(vb12), vget_high_s8(va2));
+      vprod3x12 = vmlal_s8(vprod3x12, vget_high_s8(vb12), vget_high_s8(va3));
+      vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);
+      vacc1x12 = vpadalq_s16(vacc1x12, vprod1x12);
+      vacc2x12 = vpadalq_s16(vacc2x12, vprod2x12);
+      vacc3x12 = vpadalq_s16(vacc3x12, vprod3x12);
+      int16x8_t vprod0x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va0));
+      int16x8_t vprod1x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va1));
+      int16x8_t vprod2x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va2));
+      int16x8_t vprod3x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va3));
+      vprod0x13 = vmlal_s8(vprod0x13, vget_high_s8(vb13), vget_high_s8(va0));
+      vprod1x13 = vmlal_s8(vprod1x13, vget_high_s8(vb13), vget_high_s8(va1));
+      vprod2x13 = vmlal_s8(vprod2x13, vget_high_s8(vb13), vget_high_s8(va2));
+      vprod3x13 = vmlal_s8(vprod3x13, vget_high_s8(vb13), vget_high_s8(va3));
+      vacc0x13 = vpadalq_s16(vacc0x13, vprod0x13);
+      vacc1x13 = vpadalq_s16(vacc1x13, vprod1x13);
+      vacc2x13 = vpadalq_s16(vacc2x13, vprod2x13);
+      vacc3x13 = vpadalq_s16(vacc3x13, vprod3x13);
+      int16x8_t vprod0x14 = vmull_s8(vget_low_s8(vb14), vget_low_s8(va0));
+      int16x8_t vprod1x14 = vmull_s8(vget_low_s8(vb14), vget_low_s8(va1));
+      int16x8_t vprod2x14 = vmull_s8(vget_low_s8(vb14), vget_low_s8(va2));
+      int16x8_t vprod3x14 = vmull_s8(vget_low_s8(vb14), vget_low_s8(va3));
+      vprod0x14 = vmlal_s8(vprod0x14, vget_high_s8(vb14), vget_high_s8(va0));
+      vprod1x14 = vmlal_s8(vprod1x14, vget_high_s8(vb14), vget_high_s8(va1));
+      vprod2x14 = vmlal_s8(vprod2x14, vget_high_s8(vb14), vget_high_s8(va2));
+      vprod3x14 = vmlal_s8(vprod3x14, vget_high_s8(vb14), vget_high_s8(va3));
+      vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
+      vacc1x14 = vpadalq_s16(vacc1x14, vprod1x14);
+      vacc2x14 = vpadalq_s16(vacc2x14, vprod2x14);
+      vacc3x14 = vpadalq_s16(vacc3x14, vprod3x14);
+      int16x8_t vprod0x15 = vmull_s8(vget_low_s8(vb15), vget_low_s8(va0));
+      int16x8_t vprod1x15 = vmull_s8(vget_low_s8(vb15), vget_low_s8(va1));
+      int16x8_t vprod2x15 = vmull_s8(vget_low_s8(vb15), vget_low_s8(va2));
+      int16x8_t vprod3x15 = vmull_s8(vget_low_s8(vb15), vget_low_s8(va3));
+      vprod0x15 = vmlal_s8(vprod0x15, vget_high_s8(vb15), vget_high_s8(va0));
+      vprod1x15 = vmlal_s8(vprod1x15, vget_high_s8(vb15), vget_high_s8(va1));
+      vprod2x15 = vmlal_s8(vprod2x15, vget_high_s8(vb15), vget_high_s8(va2));
+      vprod3x15 = vmlal_s8(vprod3x15, vget_high_s8(vb15), vget_high_s8(va3));
+      vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
+      vacc1x15 = vpadalq_s16(vacc1x15, vprod1x15);
+      vacc2x15 = vpadalq_s16(vacc2x15, vprod2x15);
+      vacc3x15 = vpadalq_s16(vacc3x15, vprod3x15);
+
+      k += 16 * sizeof(int8_t);
+    }
+
+#if XNN_ARCH_ARM64
+    const int32x4_t vsum0x01 = vpaddq_s32(vacc0x0, vacc0x1);
+    const int32x4_t vsum0x23 = vpaddq_s32(vacc0x2, vacc0x3);
+    const int32x4_t vsum0x45 = vpaddq_s32(vacc0x4, vacc0x5);
+    const int32x4_t vsum0x67 = vpaddq_s32(vacc0x6, vacc0x7);
+    const int32x4_t vsum0x89 = vpaddq_s32(vacc0x8, vacc0x9);
+    const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11);
+    const int32x4_t vsum0xCD = vpaddq_s32(vacc0x12, vacc0x13);
+    const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
+    const int32x4_t vsum1x01 = vpaddq_s32(vacc1x0, vacc1x1);
+    const int32x4_t vsum1x23 = vpaddq_s32(vacc1x2, vacc1x3);
+    const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);
+    const int32x4_t vsum1x67 = vpaddq_s32(vacc1x6, vacc1x7);
+    const int32x4_t vsum1x89 = vpaddq_s32(vacc1x8, vacc1x9);
+    const int32x4_t vsum1xAB = vpaddq_s32(vacc1x10, vacc1x11);
+    const int32x4_t vsum1xCD = vpaddq_s32(vacc1x12, vacc1x13);
+    const int32x4_t vsum1xEF = vpaddq_s32(vacc1x14, vacc1x15);
+    const int32x4_t vsum2x01 = vpaddq_s32(vacc2x0, vacc2x1);
+    const int32x4_t vsum2x23 = vpaddq_s32(vacc2x2, vacc2x3);
+    const int32x4_t vsum2x45 = vpaddq_s32(vacc2x4, vacc2x5);
+    const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7);
+    const int32x4_t vsum2x89 = vpaddq_s32(vacc2x8, vacc2x9);
+    const int32x4_t vsum2xAB = vpaddq_s32(vacc2x10, vacc2x11);
+    const int32x4_t vsum2xCD = vpaddq_s32(vacc2x12, vacc2x13);
+    const int32x4_t vsum2xEF = vpaddq_s32(vacc2x14, vacc2x15);
+    const int32x4_t vsum3x01 = vpaddq_s32(vacc3x0, vacc3x1);
+    const int32x4_t vsum3x23 = vpaddq_s32(vacc3x2, vacc3x3);
+    const int32x4_t vsum3x45 = vpaddq_s32(vacc3x4, vacc3x5);
+    const int32x4_t vsum3x67 = vpaddq_s32(vacc3x6, vacc3x7);
+    const int32x4_t vsum3x89 = vpaddq_s32(vacc3x8, vacc3x9);
+    const int32x4_t vsum3xAB = vpaddq_s32(vacc3x10, vacc3x11);
+    const int32x4_t vsum3xCD = vpaddq_s32(vacc3x12, vacc3x13);
+    const int32x4_t vsum3xEF = vpaddq_s32(vacc3x14, vacc3x15);
+    int32x4_t vacc0x0123 = vpaddq_s32(vsum0x01, vsum0x23);
+    int32x4_t vacc0x4567 = vpaddq_s32(vsum0x45, vsum0x67);
+    int32x4_t vacc0x89AB = vpaddq_s32(vsum0x89, vsum0xAB);
+    int32x4_t vacc0xCDEF = vpaddq_s32(vsum0xCD, vsum0xEF);
+    int32x4_t vacc1x0123 = vpaddq_s32(vsum1x01, vsum1x23);
+    int32x4_t vacc1x4567 = vpaddq_s32(vsum1x45, vsum1x67);
+    int32x4_t vacc1x89AB = vpaddq_s32(vsum1x89, vsum1xAB);
+    int32x4_t vacc1xCDEF = vpaddq_s32(vsum1xCD, vsum1xEF);
+    int32x4_t vacc2x0123 = vpaddq_s32(vsum2x01, vsum2x23);
+    int32x4_t vacc2x4567 = vpaddq_s32(vsum2x45, vsum2x67);
+    int32x4_t vacc2x89AB = vpaddq_s32(vsum2x89, vsum2xAB);
+    int32x4_t vacc2xCDEF = vpaddq_s32(vsum2xCD, vsum2xEF);
+    int32x4_t vacc3x0123 = vpaddq_s32(vsum3x01, vsum3x23);
+    int32x4_t vacc3x4567 = vpaddq_s32(vsum3x45, vsum3x67);
+    int32x4_t vacc3x89AB = vpaddq_s32(vsum3x89, vsum3xAB);
+    int32x4_t vacc3xCDEF = vpaddq_s32(vsum3xCD, vsum3xEF);
+#else
+    const int32x2_t vpsum0x0 = vadd_s32(vget_low_s32(vacc0x0), vget_high_s32(vacc0x0));
+    const int32x2_t vpsum0x1 = vadd_s32(vget_low_s32(vacc0x1), vget_high_s32(vacc0x1));
+    const int32x2_t vpsum0x2 = vadd_s32(vget_low_s32(vacc0x2), vget_high_s32(vacc0x2));
+    const int32x2_t vpsum0x3 = vadd_s32(vget_low_s32(vacc0x3), vget_high_s32(vacc0x3));
+    const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1);
+    const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3);
+    int32x4_t vacc0x0123 = vcombine_s32(vsum0x01, vsum0x23);
+    const int32x2_t vpsum0x4 = vadd_s32(vget_low_s32(vacc0x4), vget_high_s32(vacc0x4));
+    const int32x2_t vpsum0x5 = vadd_s32(vget_low_s32(vacc0x5), vget_high_s32(vacc0x5));
+    const int32x2_t vpsum0x6 = vadd_s32(vget_low_s32(vacc0x6), vget_high_s32(vacc0x6));
+    const int32x2_t vpsum0x7 = vadd_s32(vget_low_s32(vacc0x7), vget_high_s32(vacc0x7));
+    const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5);
+    const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7);
+    int32x4_t vacc0x4567 = vcombine_s32(vsum0x45, vsum0x67);
+    const int32x2_t vpsum0x8 = vadd_s32(vget_low_s32(vacc0x8), vget_high_s32(vacc0x8));
+    const int32x2_t vpsum0x9 = vadd_s32(vget_low_s32(vacc0x9), vget_high_s32(vacc0x9));
+    const int32x2_t vpsum0xA = vadd_s32(vget_low_s32(vacc0x10), vget_high_s32(vacc0x10));
+    const int32x2_t vpsum0xB = vadd_s32(vget_low_s32(vacc0x11), vget_high_s32(vacc0x11));
+    const int32x2_t vsum0x89 = vpadd_s32(vpsum0x8, vpsum0x9);
+    const int32x2_t vsum0xAB = vpadd_s32(vpsum0xA, vpsum0xB);
+    int32x4_t vacc0x89AB = vcombine_s32(vsum0x89, vsum0xAB);
+    const int32x2_t vpsum0xC = vadd_s32(vget_low_s32(vacc0x12), vget_high_s32(vacc0x12));
+    const int32x2_t vpsum0xD = vadd_s32(vget_low_s32(vacc0x13), vget_high_s32(vacc0x13));
+    const int32x2_t vpsum0xE = vadd_s32(vget_low_s32(vacc0x14), vget_high_s32(vacc0x14));
+    const int32x2_t vpsum0xF = vadd_s32(vget_low_s32(vacc0x15), vget_high_s32(vacc0x15));
+    const int32x2_t vsum0xCD = vpadd_s32(vpsum0xC, vpsum0xD);
+    const int32x2_t vsum0xEF = vpadd_s32(vpsum0xE, vpsum0xF);
+    int32x4_t vacc0xCDEF = vcombine_s32(vsum0xCD, vsum0xEF);
+    const int32x2_t vpsum1x0 = vadd_s32(vget_low_s32(vacc1x0), vget_high_s32(vacc1x0));
+    const int32x2_t vpsum1x1 = vadd_s32(vget_low_s32(vacc1x1), vget_high_s32(vacc1x1));
+    const int32x2_t vpsum1x2 = vadd_s32(vget_low_s32(vacc1x2), vget_high_s32(vacc1x2));
+    const int32x2_t vpsum1x3 = vadd_s32(vget_low_s32(vacc1x3), vget_high_s32(vacc1x3));
+    const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1);
+    const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3);
+    int32x4_t vacc1x0123 = vcombine_s32(vsum1x01, vsum1x23);
+    const int32x2_t vpsum1x4 = vadd_s32(vget_low_s32(vacc1x4), vget_high_s32(vacc1x4));
+    const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));
+    const int32x2_t vpsum1x6 = vadd_s32(vget_low_s32(vacc1x6), vget_high_s32(vacc1x6));
+    const int32x2_t vpsum1x7 = vadd_s32(vget_low_s32(vacc1x7), vget_high_s32(vacc1x7));
+    const int32x2_t vsum1x45 = vpadd_s32(vpsum1x4, vpsum1x5);
+    const int32x2_t vsum1x67 = vpadd_s32(vpsum1x6, vpsum1x7);
+    int32x4_t vacc1x4567 = vcombine_s32(vsum1x45, vsum1x67);
+    const int32x2_t vpsum1x8 = vadd_s32(vget_low_s32(vacc1x8), vget_high_s32(vacc1x8));
+    const int32x2_t vpsum1x9 = vadd_s32(vget_low_s32(vacc1x9), vget_high_s32(vacc1x9));
+    const int32x2_t vpsum1xA = vadd_s32(vget_low_s32(vacc1x10), vget_high_s32(vacc1x10));
+    const int32x2_t vpsum1xB = vadd_s32(vget_low_s32(vacc1x11), vget_high_s32(vacc1x11));
+    const int32x2_t vsum1x89 = vpadd_s32(vpsum1x8, vpsum1x9);
+    const int32x2_t vsum1xAB = vpadd_s32(vpsum1xA, vpsum1xB);
+    int32x4_t vacc1x89AB = vcombine_s32(vsum1x89, vsum1xAB);
+    const int32x2_t vpsum1xC = vadd_s32(vget_low_s32(vacc1x12), vget_high_s32(vacc1x12));
+    const int32x2_t vpsum1xD = vadd_s32(vget_low_s32(vacc1x13), vget_high_s32(vacc1x13));
+    const int32x2_t vpsum1xE = vadd_s32(vget_low_s32(vacc1x14), vget_high_s32(vacc1x14));
+    const int32x2_t vpsum1xF = vadd_s32(vget_low_s32(vacc1x15), vget_high_s32(vacc1x15));
+    const int32x2_t vsum1xCD = vpadd_s32(vpsum1xC, vpsum1xD);
+    const int32x2_t vsum1xEF = vpadd_s32(vpsum1xE, vpsum1xF);
+    int32x4_t vacc1xCDEF = vcombine_s32(vsum1xCD, vsum1xEF);
+    const int32x2_t vpsum2x0 = vadd_s32(vget_low_s32(vacc2x0), vget_high_s32(vacc2x0));
+    const int32x2_t vpsum2x1 = vadd_s32(vget_low_s32(vacc2x1), vget_high_s32(vacc2x1));
+    const int32x2_t vpsum2x2 = vadd_s32(vget_low_s32(vacc2x2), vget_high_s32(vacc2x2));
+    const int32x2_t vpsum2x3 = vadd_s32(vget_low_s32(vacc2x3), vget_high_s32(vacc2x3));
+    const int32x2_t vsum2x01 = vpadd_s32(vpsum2x0, vpsum2x1);
+    const int32x2_t vsum2x23 = vpadd_s32(vpsum2x2, vpsum2x3);
+    int32x4_t vacc2x0123 = vcombine_s32(vsum2x01, vsum2x23);
+    const int32x2_t vpsum2x4 = vadd_s32(vget_low_s32(vacc2x4), vget_high_s32(vacc2x4));
+    const int32x2_t vpsum2x5 = vadd_s32(vget_low_s32(vacc2x5), vget_high_s32(vacc2x5));
+    const int32x2_t vpsum2x6 = vadd_s32(vget_low_s32(vacc2x6), vget_high_s32(vacc2x6));
+    const int32x2_t vpsum2x7 = vadd_s32(vget_low_s32(vacc2x7), vget_high_s32(vacc2x7));
+    const int32x2_t vsum2x45 = vpadd_s32(vpsum2x4, vpsum2x5);
+    const int32x2_t vsum2x67 = vpadd_s32(vpsum2x6, vpsum2x7);
+    int32x4_t vacc2x4567 = vcombine_s32(vsum2x45, vsum2x67);
+    const int32x2_t vpsum2x8 = vadd_s32(vget_low_s32(vacc2x8), vget_high_s32(vacc2x8));
+    const int32x2_t vpsum2x9 = vadd_s32(vget_low_s32(vacc2x9), vget_high_s32(vacc2x9));
+    const int32x2_t vpsum2xA = vadd_s32(vget_low_s32(vacc2x10), vget_high_s32(vacc2x10));
+    const int32x2_t vpsum2xB = vadd_s32(vget_low_s32(vacc2x11), vget_high_s32(vacc2x11));
+    const int32x2_t vsum2x89 = vpadd_s32(vpsum2x8, vpsum2x9);
+    const int32x2_t vsum2xAB = vpadd_s32(vpsum2xA, vpsum2xB);
+    int32x4_t vacc2x89AB = vcombine_s32(vsum2x89, vsum2xAB);
+    const int32x2_t vpsum2xC = vadd_s32(vget_low_s32(vacc2x12), vget_high_s32(vacc2x12));
+    const int32x2_t vpsum2xD = vadd_s32(vget_low_s32(vacc2x13), vget_high_s32(vacc2x13));
+    const int32x2_t vpsum2xE = vadd_s32(vget_low_s32(vacc2x14), vget_high_s32(vacc2x14));
+    const int32x2_t vpsum2xF = vadd_s32(vget_low_s32(vacc2x15), vget_high_s32(vacc2x15));
+    const int32x2_t vsum2xCD = vpadd_s32(vpsum2xC, vpsum2xD);
+    const int32x2_t vsum2xEF = vpadd_s32(vpsum2xE, vpsum2xF);
+    int32x4_t vacc2xCDEF = vcombine_s32(vsum2xCD, vsum2xEF);
+    const int32x2_t vpsum3x0 = vadd_s32(vget_low_s32(vacc3x0), vget_high_s32(vacc3x0));
+    const int32x2_t vpsum3x1 = vadd_s32(vget_low_s32(vacc3x1), vget_high_s32(vacc3x1));
+    const int32x2_t vpsum3x2 = vadd_s32(vget_low_s32(vacc3x2), vget_high_s32(vacc3x2));
+    const int32x2_t vpsum3x3 = vadd_s32(vget_low_s32(vacc3x3), vget_high_s32(vacc3x3));
+    const int32x2_t vsum3x01 = vpadd_s32(vpsum3x0, vpsum3x1);
+    const int32x2_t vsum3x23 = vpadd_s32(vpsum3x2, vpsum3x3);
+    int32x4_t vacc3x0123 = vcombine_s32(vsum3x01, vsum3x23);
+    const int32x2_t vpsum3x4 = vadd_s32(vget_low_s32(vacc3x4), vget_high_s32(vacc3x4));
+    const int32x2_t vpsum3x5 = vadd_s32(vget_low_s32(vacc3x5), vget_high_s32(vacc3x5));
+    const int32x2_t vpsum3x6 = vadd_s32(vget_low_s32(vacc3x6), vget_high_s32(vacc3x6));
+    const int32x2_t vpsum3x7 = vadd_s32(vget_low_s32(vacc3x7), vget_high_s32(vacc3x7));
+    const int32x2_t vsum3x45 = vpadd_s32(vpsum3x4, vpsum3x5);
+    const int32x2_t vsum3x67 = vpadd_s32(vpsum3x6, vpsum3x7);
+    int32x4_t vacc3x4567 = vcombine_s32(vsum3x45, vsum3x67);
+    const int32x2_t vpsum3x8 = vadd_s32(vget_low_s32(vacc3x8), vget_high_s32(vacc3x8));
+    const int32x2_t vpsum3x9 = vadd_s32(vget_low_s32(vacc3x9), vget_high_s32(vacc3x9));
+    const int32x2_t vpsum3xA = vadd_s32(vget_low_s32(vacc3x10), vget_high_s32(vacc3x10));
+    const int32x2_t vpsum3xB = vadd_s32(vget_low_s32(vacc3x11), vget_high_s32(vacc3x11));
+    const int32x2_t vsum3x89 = vpadd_s32(vpsum3x8, vpsum3x9);
+    const int32x2_t vsum3xAB = vpadd_s32(vpsum3xA, vpsum3xB);
+    int32x4_t vacc3x89AB = vcombine_s32(vsum3x89, vsum3xAB);
+    const int32x2_t vpsum3xC = vadd_s32(vget_low_s32(vacc3x12), vget_high_s32(vacc3x12));
+    const int32x2_t vpsum3xD = vadd_s32(vget_low_s32(vacc3x13), vget_high_s32(vacc3x13));
+    const int32x2_t vpsum3xE = vadd_s32(vget_low_s32(vacc3x14), vget_high_s32(vacc3x14));
+    const int32x2_t vpsum3xF = vadd_s32(vget_low_s32(vacc3x15), vget_high_s32(vacc3x15));
+    const int32x2_t vsum3xCD = vpadd_s32(vpsum3xC, vpsum3xD);
+    const int32x2_t vsum3xEF = vpadd_s32(vpsum3xE, vpsum3xF);
+    int32x4_t vacc3xCDEF = vcombine_s32(vsum3xCD, vsum3xEF);
+#endif
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+    vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc1x89AB = vqrdmulhq_s32(vacc1x89AB, vmultiplier);
+    vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier);
+    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+    vacc2x89AB = vqrdmulhq_s32(vacc2x89AB, vmultiplier);
+    vacc2xCDEF = vqrdmulhq_s32(vacc2xCDEF, vmultiplier);
+    vacc3x0123 = vqrdmulhq_s32(vacc3x0123, vmultiplier);
+    vacc3x4567 = vqrdmulhq_s32(vacc3x4567, vmultiplier);
+    vacc3x89AB = vqrdmulhq_s32(vacc3x89AB, vmultiplier);
+    vacc3xCDEF = vqrdmulhq_s32(vacc3xCDEF, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+    vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc1x89AB = vsraq_n_s32(vacc1x89AB, vbicq_s32(vacc1x89AB, vzero_shift_mask), 31);
+    vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31);
+    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+    vacc2x89AB = vsraq_n_s32(vacc2x89AB, vbicq_s32(vacc2x89AB, vzero_shift_mask), 31);
+    vacc2xCDEF = vsraq_n_s32(vacc2xCDEF, vbicq_s32(vacc2xCDEF, vzero_shift_mask), 31);
+    vacc3x0123 = vsraq_n_s32(vacc3x0123, vbicq_s32(vacc3x0123, vzero_shift_mask), 31);
+    vacc3x4567 = vsraq_n_s32(vacc3x4567, vbicq_s32(vacc3x4567, vzero_shift_mask), 31);
+    vacc3x89AB = vsraq_n_s32(vacc3x89AB, vbicq_s32(vacc3x89AB, vzero_shift_mask), 31);
+    vacc3xCDEF = vsraq_n_s32(vacc3xCDEF, vbicq_s32(vacc3xCDEF, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc1x89AB = vrshlq_s32(vacc1x89AB, vright_shift);
+    vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_shift);
+    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+    vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_shift);
+    vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_shift);
+    vacc3x0123 = vrshlq_s32(vacc3x0123, vright_shift);
+    vacc3x4567 = vrshlq_s32(vacc3x4567, vright_shift);
+    vacc3x89AB = vrshlq_s32(vacc3x89AB, vright_shift);
+    vacc3xCDEF = vrshlq_s32(vacc3xCDEF, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+    const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), voutput_zero_point);
+    const int16x8_t vacc3x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567), voutput_zero_point);
+    const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), voutput_zero_point);
+    int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+    int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
+    int8x16_t vout2x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc2x89ABCDEF);
+    int8x16_t vout3x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc3x01234567), vacc3x89ABCDEF);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+    const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)), voutput_zero_point);
+    const int16x8_t vacc3x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x0123), vqmovn_s32(vacc3x4567)), voutput_zero_point);
+    const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3xCDEF)), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+    int8x16_t vout1x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc1x01234567), vqmovn_s16(vacc1x89ABCDEF));
+    int8x16_t vout2x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc2x89ABCDEF));
+    int8x16_t vout3x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc3x01234567), vqmovn_s16(vacc3x89ABCDEF));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+    vout1x0123456789ABCDEF = vmaxq_s8(vout1x0123456789ABCDEF, voutput_min);
+    vout2x0123456789ABCDEF = vmaxq_s8(vout2x0123456789ABCDEF, voutput_min);
+    vout3x0123456789ABCDEF = vmaxq_s8(vout3x0123456789ABCDEF, voutput_min);
+
+    vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+    vout1x0123456789ABCDEF = vminq_s8(vout1x0123456789ABCDEF, voutput_max);
+    vout2x0123456789ABCDEF = vminq_s8(vout2x0123456789ABCDEF, voutput_max);
+    vout3x0123456789ABCDEF = vminq_s8(vout3x0123456789ABCDEF, voutput_max);
+
+    if (nc >= 16) {
+      vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+      vst1q_s8(c1 + 0, vout1x0123456789ABCDEF);
+      vst1q_s8(c2 + 0, vout2x0123456789ABCDEF);
+      vst1q_s8(c3 + 0, vout3x0123456789ABCDEF);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+
+      nc -= 16;
+    } else {
+      int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8(vout1x0123456789ABCDEF));
+      int8x16_t vout2x01234567_3x01234567 = vcombine_s8(vget_low_s8(vout2x0123456789ABCDEF), vget_low_s8(vout3x0123456789ABCDEF));
+      if (nc & 8) {
+        vst1_s8(c0, vget_low_s8(vout0x01234567_1x01234567)); c0 += 8;
+        vst1_s8(c1, vget_high_s8(vout0x01234567_1x01234567)); c1 += 8;
+        vst1_s8(c2, vget_low_s8(vout2x01234567_3x01234567)); c2 += 8;
+        vst1_s8(c3, vget_high_s8(vout2x01234567_3x01234567)); c3 += 8;
+        vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0123456789ABCDEF));
+        vout2x01234567_3x01234567 = vcombine_s8(vget_high_s8(vout2x0123456789ABCDEF), vget_high_s8(vout3x0123456789ABCDEF));
+      }
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 0); c2 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c3, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 2); c3 += 4;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 0); c2 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c3, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 4); c3 += 2;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c2, vout2x01234567_3x01234567, 0);
+        vst1q_lane_s8(c3, vout2x01234567_3x01234567, 8);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-gemm/gen/4x16c2-minmax-neon-mlal-padal-dup.c b/src/qs8-gemm/gen/4x16c2-minmax-neon-mlal-padal-dup.c
new file mode 100644
index 0000000..c8685b2
--- /dev/null
+++ b/src/qs8-gemm/gen/4x16c2-minmax-neon-mlal-padal-dup.c
@@ -0,0 +1,750 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/c2-neon-mull-padal-dup.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+  const int8_t* a3 = (const int8_t*) ((uintptr_t) a2 + a_stride);
+  int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    a3 = a2;
+    c3 = c2;
+  }
+
+  do {
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc1x0123 = vacc0x0123;
+    int32x4_t vacc1x4567 = vacc0x4567;
+    int32x4_t vacc1x89AB = vacc0x89AB;
+    int32x4_t vacc1xCDEF = vacc0xCDEF;
+    int32x4_t vacc2x0123 = vacc0x0123;
+    int32x4_t vacc2x4567 = vacc0x4567;
+    int32x4_t vacc2x89AB = vacc0x89AB;
+    int32x4_t vacc2xCDEF = vacc0xCDEF;
+    int32x4_t vacc3x0123 = vacc0x0123;
+    int32x4_t vacc3x4567 = vacc0x4567;
+    int32x4_t vacc3x89AB = vacc0x89AB;
+    int32x4_t vacc3xCDEF = vacc0xCDEF;
+
+    size_t k = kc;
+
+    while (k >= 16 * sizeof(int8_t)) {
+      const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;
+      const int8x8_t va0x1 = vld1_s8(a0); a0 += 8;
+      const int8x8_t va1x0 = vld1_s8(a1); a1 += 8;
+      const int8x8_t va1x1 = vld1_s8(a1); a1 += 8;
+      const int8x8_t va2x0 = vld1_s8(a2); a2 += 8;
+      const int8x8_t va2x1 = vld1_s8(a2); a2 += 8;
+      const int8x8_t va3x0 = vld1_s8(a3); a3 += 8;
+      const int8x8_t va3x1 = vld1_s8(a3); a3 += 8;
+
+      const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABc0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vbCDEFc0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABc1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vbCDEFc1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABc2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vbCDEFc2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABc3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vbCDEFc3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+      int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0)));
+      int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
+      int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 0)));
+      const int8x8_t vb0123c0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x0123c0 = vmlal_s8(vprod0x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+      vprod1x0123c0 = vmlal_s8(vprod1x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
+      vprod2x0123c0 = vmlal_s8(vprod2x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 0)));
+      vprod3x0123c0 = vmlal_s8(vprod3x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 0)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+      vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+      vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c0);
+      int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+      int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0)));
+      int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
+      int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 0)));
+      const int8x8_t vb4567c0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x4567c0 = vmlal_s8(vprod0x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+      vprod1x4567c0 = vmlal_s8(vprod1x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
+      vprod2x4567c0 = vmlal_s8(vprod2x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 0)));
+      vprod3x4567c0 = vmlal_s8(vprod3x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 0)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+      vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+      vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c0);
+      int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+      int16x8_t vprod1x89ABc0 = vmull_s8(vb89ABc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0)));
+      int16x8_t vprod2x89ABc0 = vmull_s8(vb89ABc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
+      int16x8_t vprod3x89ABc0 = vmull_s8(vb89ABc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 0)));
+      const int8x8_t vb89ABc0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x89ABc0 = vmlal_s8(vprod0x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+      vprod1x89ABc0 = vmlal_s8(vprod1x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
+      vprod2x89ABc0 = vmlal_s8(vprod2x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 0)));
+      vprod3x89ABc0 = vmlal_s8(vprod3x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 0)));
+      vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+      vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc0);
+      vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
+      vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc0);
+      int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+      int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0)));
+      int16x8_t vprod2xCDEFc0 = vmull_s8(vbCDEFc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
+      int16x8_t vprod3xCDEFc0 = vmull_s8(vbCDEFc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 0)));
+      const int8x8_t vbCDEFc0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0xCDEFc0 = vmlal_s8(vprod0xCDEFc0, vbCDEFc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+      vprod1xCDEFc0 = vmlal_s8(vprod1xCDEFc0, vbCDEFc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
+      vprod2xCDEFc0 = vmlal_s8(vprod2xCDEFc0, vbCDEFc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 0)));
+      vprod3xCDEFc0 = vmlal_s8(vprod3xCDEFc0, vbCDEFc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 0)));
+      vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+      vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0);
+      vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0);
+      vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc0);
+      int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+      int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 1)));
+      int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
+      int16x8_t vprod3x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 1)));
+      const int8x8_t vb0123c1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x0123c1 = vmlal_s8(vprod0x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+      vprod1x0123c1 = vmlal_s8(vprod1x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
+      vprod2x0123c1 = vmlal_s8(vprod2x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 1)));
+      vprod3x0123c1 = vmlal_s8(vprod3x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 1)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+      vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+      vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c1);
+      int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+      int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 1)));
+      int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
+      int16x8_t vprod3x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 1)));
+      const int8x8_t vb4567c1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x4567c1 = vmlal_s8(vprod0x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+      vprod1x4567c1 = vmlal_s8(vprod1x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
+      vprod2x4567c1 = vmlal_s8(vprod2x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 1)));
+      vprod3x4567c1 = vmlal_s8(vprod3x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 1)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+      vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+      vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c1);
+      int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+      int16x8_t vprod1x89ABc1 = vmull_s8(vb89ABc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 1)));
+      int16x8_t vprod2x89ABc1 = vmull_s8(vb89ABc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
+      int16x8_t vprod3x89ABc1 = vmull_s8(vb89ABc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 1)));
+      const int8x8_t vb89ABc1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x89ABc1 = vmlal_s8(vprod0x89ABc1, vb89ABc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+      vprod1x89ABc1 = vmlal_s8(vprod1x89ABc1, vb89ABc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
+      vprod2x89ABc1 = vmlal_s8(vprod2x89ABc1, vb89ABc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 1)));
+      vprod3x89ABc1 = vmlal_s8(vprod3x89ABc1, vb89ABc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 1)));
+      vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+      vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc1);
+      vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1);
+      vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc1);
+      int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+      int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 1)));
+      int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
+      int16x8_t vprod3xCDEFc1 = vmull_s8(vbCDEFc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 1)));
+      const int8x8_t vbCDEFc1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0xCDEFc1 = vmlal_s8(vprod0xCDEFc1, vbCDEFc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+      vprod1xCDEFc1 = vmlal_s8(vprod1xCDEFc1, vbCDEFc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
+      vprod2xCDEFc1 = vmlal_s8(vprod2xCDEFc1, vbCDEFc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 1)));
+      vprod3xCDEFc1 = vmlal_s8(vprod3xCDEFc1, vbCDEFc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 1)));
+      vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+      vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1);
+      vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1);
+      vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc1);
+      int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+      int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 2)));
+      int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 2)));
+      int16x8_t vprod3x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 2)));
+      const int8x8_t vb0123c2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x0123c2 = vmlal_s8(vprod0x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+      vprod1x0123c2 = vmlal_s8(vprod1x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
+      vprod2x0123c2 = vmlal_s8(vprod2x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 2)));
+      vprod3x0123c2 = vmlal_s8(vprod3x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 2)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+      vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+      vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c2);
+      int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+      int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 2)));
+      int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 2)));
+      int16x8_t vprod3x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 2)));
+      const int8x8_t vb4567c2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x4567c2 = vmlal_s8(vprod0x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+      vprod1x4567c2 = vmlal_s8(vprod1x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
+      vprod2x4567c2 = vmlal_s8(vprod2x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 2)));
+      vprod3x4567c2 = vmlal_s8(vprod3x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 2)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+      vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+      vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c2);
+      int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+      int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 2)));
+      int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 2)));
+      int16x8_t vprod3x89ABc2 = vmull_s8(vb89ABc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 2)));
+      const int8x8_t vb89ABc2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x89ABc2 = vmlal_s8(vprod0x89ABc2, vb89ABc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+      vprod1x89ABc2 = vmlal_s8(vprod1x89ABc2, vb89ABc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
+      vprod2x89ABc2 = vmlal_s8(vprod2x89ABc2, vb89ABc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 2)));
+      vprod3x89ABc2 = vmlal_s8(vprod3x89ABc2, vb89ABc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 2)));
+      vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+      vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
+      vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
+      vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc2);
+      int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+      int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 2)));
+      int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 2)));
+      int16x8_t vprod3xCDEFc2 = vmull_s8(vbCDEFc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 2)));
+      const int8x8_t vbCDEFc2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0xCDEFc2 = vmlal_s8(vprod0xCDEFc2, vbCDEFc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+      vprod1xCDEFc2 = vmlal_s8(vprod1xCDEFc2, vbCDEFc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
+      vprod2xCDEFc2 = vmlal_s8(vprod2xCDEFc2, vbCDEFc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 2)));
+      vprod3xCDEFc2 = vmlal_s8(vprod3xCDEFc2, vbCDEFc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 2)));
+      vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+      vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
+      vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2);
+      vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc2);
+      int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+      int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 3)));
+      int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 3)));
+      int16x8_t vprod3x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 3)));
+      const int8x8_t vb0123c3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x0123c3 = vmlal_s8(vprod0x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+      vprod1x0123c3 = vmlal_s8(vprod1x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
+      vprod2x0123c3 = vmlal_s8(vprod2x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 3)));
+      vprod3x0123c3 = vmlal_s8(vprod3x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 3)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+      vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
+      vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c3);
+      int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+      int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 3)));
+      int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 3)));
+      int16x8_t vprod3x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 3)));
+      const int8x8_t vb4567c3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x4567c3 = vmlal_s8(vprod0x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+      vprod1x4567c3 = vmlal_s8(vprod1x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
+      vprod2x4567c3 = vmlal_s8(vprod2x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 3)));
+      vprod3x4567c3 = vmlal_s8(vprod3x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 3)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+      vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
+      vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c3);
+      int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+      int16x8_t vprod1x89ABc3 = vmull_s8(vb89ABc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 3)));
+      int16x8_t vprod2x89ABc3 = vmull_s8(vb89ABc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 3)));
+      int16x8_t vprod3x89ABc3 = vmull_s8(vb89ABc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 3)));
+      const int8x8_t vb89ABc3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x89ABc3 = vmlal_s8(vprod0x89ABc3, vb89ABc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+      vprod1x89ABc3 = vmlal_s8(vprod1x89ABc3, vb89ABc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
+      vprod2x89ABc3 = vmlal_s8(vprod2x89ABc3, vb89ABc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 3)));
+      vprod3x89ABc3 = vmlal_s8(vprod3x89ABc3, vb89ABc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 3)));
+      vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
+      vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc3);
+      vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3);
+      vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc3);
+      int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+      int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 3)));
+      int16x8_t vprod2xCDEFc3 = vmull_s8(vbCDEFc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 3)));
+      int16x8_t vprod3xCDEFc3 = vmull_s8(vbCDEFc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 3)));
+      const int8x8_t vbCDEFc3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0xCDEFc3 = vmlal_s8(vprod0xCDEFc3, vbCDEFc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+      vprod1xCDEFc3 = vmlal_s8(vprod1xCDEFc3, vbCDEFc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
+      vprod2xCDEFc3 = vmlal_s8(vprod2xCDEFc3, vbCDEFc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 3)));
+      vprod3xCDEFc3 = vmlal_s8(vprod3xCDEFc3, vbCDEFc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 3)));
+      vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
+      vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3);
+      vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3);
+      vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc3);
+
+      k -= 16 * sizeof(int8_t);
+    }
+
+    if (k >= 8 * sizeof(int8_t)) {
+      const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+      const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+      const int8x8_t va2 = vld1_s8(a2); a2 += 8;
+      const int8x8_t va3 = vld1_s8(a3); a3 += 8;
+
+      const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vbCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vbCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vbCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vbCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+      const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+      const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+      const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+      const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+      const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+      const int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      const int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+      const int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+      const int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+      vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+      vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+      vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+      vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
+      const int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      const int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+      const int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+      const int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+      vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+      vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+      vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+      vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
+      const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+      const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+      const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+      const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+      const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+      const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+      const int16x8_t vprod1x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      const int16x8_t vprod1x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+      const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+      const int16x8_t vprod1x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+      vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc0);
+      vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc1);
+      vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
+      vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc3);
+      const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+      const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+      const int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+      vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0);
+      vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1);
+      vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
+      vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3);
+      const int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+      const int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+      const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+      const int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+      vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+      vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+      vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+      vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
+      const int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+      const int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+      const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+      const int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+      vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+      vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+      vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+      vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
+      const int16x8_t vprod2x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+      const int16x8_t vprod2x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+      const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+      const int16x8_t vprod2x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+      vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
+      vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1);
+      vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
+      vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3);
+      const int16x8_t vprod2xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+      const int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+      const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+      const int16x8_t vprod2xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+      vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0);
+      vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1);
+      vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2);
+      vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3);
+      const int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+      const int16x8_t vprod3x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+      const int16x8_t vprod3x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+      const int16x8_t vprod3x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
+      vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c0);
+      vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c1);
+      vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c2);
+      vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c3);
+      const int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+      const int16x8_t vprod3x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+      const int16x8_t vprod3x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+      const int16x8_t vprod3x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
+      vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c0);
+      vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c1);
+      vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c2);
+      vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c3);
+      const int16x8_t vprod3x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+      const int16x8_t vprod3x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+      const int16x8_t vprod3x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+      const int16x8_t vprod3x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
+      vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc0);
+      vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc1);
+      vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc2);
+      vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc3);
+      const int16x8_t vprod3xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+      const int16x8_t vprod3xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+      const int16x8_t vprod3xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+      const int16x8_t vprod3xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
+      vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc0);
+      vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc1);
+      vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc2);
+      vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc3);
+
+      k -= 8 * sizeof(int8_t);
+    }
+
+    if XNN_UNLIKELY(k != 0) {
+      const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+      const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
+      const int8x8_t va2 = vld1_s8(a2); a2 = (const int8_t*) ((uintptr_t) a2 + k);
+      const int8x8_t va3 = vld1_s8(a3); a3 = (const int8_t*) ((uintptr_t) a3 + k);
+
+      const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vbCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+      const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+      const int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+      const int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+      const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+      const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+      const int16x8_t vprod1x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc0);
+      const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0);
+      const int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+      vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+      const int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+      vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+      const int16x8_t vprod2x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+      vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
+      const int16x8_t vprod2xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+      vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0);
+      const int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+      vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c0);
+      const int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+      vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c0);
+      const int16x8_t vprod3x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+      vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc0);
+      const int16x8_t vprod3xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+      vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc0);
+
+      if (k > 2 * sizeof(int8_t)) {
+        const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+        const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+        const int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+        const int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+        const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+        const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+        const int16x8_t vprod1x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc1);
+        const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1);
+        const int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+        const int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+        const int16x8_t vprod2x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+        vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1);
+        const int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+        vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1);
+        const int16x8_t vprod3x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+        vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c1);
+        const int16x8_t vprod3x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+        vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c1);
+        const int16x8_t vprod3x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+        vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc1);
+        const int16x8_t vprod3xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+        vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc1);
+
+        if (k > 4 * sizeof(int8_t)) {
+          const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb89ABc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vbCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+          vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+          const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+          vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+          const int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+          vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+          const int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+          vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+          const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+          vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+          const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+          vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+          const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+          vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
+          const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+          vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
+          const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+          vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+          const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+          vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+          const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+          vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
+          const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+          vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2);
+          const int16x8_t vprod3x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+          vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c2);
+          const int16x8_t vprod3x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+          vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c2);
+          const int16x8_t vprod3x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+          vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc2);
+          const int16x8_t vprod3xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+          vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc2);
+        }
+      }
+    }
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+    vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc1x89AB = vqrdmulhq_s32(vacc1x89AB, vmultiplier);
+    vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier);
+    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+    vacc2x89AB = vqrdmulhq_s32(vacc2x89AB, vmultiplier);
+    vacc2xCDEF = vqrdmulhq_s32(vacc2xCDEF, vmultiplier);
+    vacc3x0123 = vqrdmulhq_s32(vacc3x0123, vmultiplier);
+    vacc3x4567 = vqrdmulhq_s32(vacc3x4567, vmultiplier);
+    vacc3x89AB = vqrdmulhq_s32(vacc3x89AB, vmultiplier);
+    vacc3xCDEF = vqrdmulhq_s32(vacc3xCDEF, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+    vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc1x89AB = vsraq_n_s32(vacc1x89AB, vbicq_s32(vacc1x89AB, vzero_shift_mask), 31);
+    vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31);
+    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+    vacc2x89AB = vsraq_n_s32(vacc2x89AB, vbicq_s32(vacc2x89AB, vzero_shift_mask), 31);
+    vacc2xCDEF = vsraq_n_s32(vacc2xCDEF, vbicq_s32(vacc2xCDEF, vzero_shift_mask), 31);
+    vacc3x0123 = vsraq_n_s32(vacc3x0123, vbicq_s32(vacc3x0123, vzero_shift_mask), 31);
+    vacc3x4567 = vsraq_n_s32(vacc3x4567, vbicq_s32(vacc3x4567, vzero_shift_mask), 31);
+    vacc3x89AB = vsraq_n_s32(vacc3x89AB, vbicq_s32(vacc3x89AB, vzero_shift_mask), 31);
+    vacc3xCDEF = vsraq_n_s32(vacc3xCDEF, vbicq_s32(vacc3xCDEF, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc1x89AB = vrshlq_s32(vacc1x89AB, vright_shift);
+    vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_shift);
+    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+    vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_shift);
+    vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_shift);
+    vacc3x0123 = vrshlq_s32(vacc3x0123, vright_shift);
+    vacc3x4567 = vrshlq_s32(vacc3x4567, vright_shift);
+    vacc3x89AB = vrshlq_s32(vacc3x89AB, vright_shift);
+    vacc3xCDEF = vrshlq_s32(vacc3xCDEF, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+    const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), voutput_zero_point);
+    const int16x8_t vacc3x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567), voutput_zero_point);
+    const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+    int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
+    int8x16_t vout2x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc2x89ABCDEF);
+    int8x16_t vout3x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc3x01234567), vacc3x89ABCDEF);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+    const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)), voutput_zero_point);
+    const int16x8_t vacc3x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x0123), vqmovn_s32(vacc3x4567)), voutput_zero_point);
+    const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3xCDEF)), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+    int8x16_t vout1x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc1x01234567), vqmovn_s16(vacc1x89ABCDEF));
+    int8x16_t vout2x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc2x89ABCDEF));
+    int8x16_t vout3x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc3x01234567), vqmovn_s16(vacc3x89ABCDEF));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+    vout1x0123456789ABCDEF = vmaxq_s8(vout1x0123456789ABCDEF, voutput_min);
+    vout2x0123456789ABCDEF = vmaxq_s8(vout2x0123456789ABCDEF, voutput_min);
+    vout3x0123456789ABCDEF = vmaxq_s8(vout3x0123456789ABCDEF, voutput_min);
+
+    vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+    vout1x0123456789ABCDEF = vminq_s8(vout1x0123456789ABCDEF, voutput_max);
+    vout2x0123456789ABCDEF = vminq_s8(vout2x0123456789ABCDEF, voutput_max);
+    vout3x0123456789ABCDEF = vminq_s8(vout3x0123456789ABCDEF, voutput_max);
+
+    if (nc >= 16) {
+      vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+      vst1q_s8(c1 + 0, vout1x0123456789ABCDEF);
+      vst1q_s8(c2 + 0, vout2x0123456789ABCDEF);
+      vst1q_s8(c3 + 0, vout3x0123456789ABCDEF);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+
+      nc -= 16;
+    } else {
+      int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8(vout1x0123456789ABCDEF));
+      int8x16_t vout2x01234567_3x01234567 = vcombine_s8(vget_low_s8(vout2x0123456789ABCDEF), vget_low_s8(vout3x0123456789ABCDEF));
+      if (nc & 8) {
+        vst1_s8(c0, vget_low_s8(vout0x01234567_1x01234567)); c0 += 8;
+        vst1_s8(c1, vget_high_s8(vout0x01234567_1x01234567)); c1 += 8;
+        vst1_s8(c2, vget_low_s8(vout2x01234567_3x01234567)); c2 += 8;
+        vst1_s8(c3, vget_high_s8(vout2x01234567_3x01234567)); c3 += 8;
+        vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0123456789ABCDEF));
+        vout2x01234567_3x01234567 = vcombine_s8(vget_high_s8(vout2x0123456789ABCDEF), vget_high_s8(vout3x0123456789ABCDEF));
+      }
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 0); c2 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c3, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 2); c3 += 4;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 0); c2 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c3, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 4); c3 += 2;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c2, vout2x01234567_3x01234567, 0);
+        vst1q_lane_s8(c3, vout2x01234567_3x01234567, 8);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
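For reference, the output stage shared by the new 4x16c2 NEON kernels above (the vqrdmulhq_s32 multiply, the vsraq_n_s32/vbicq_s32 fix-up, the vrshlq_s32 rounding shift, the zero-point add, and the final min/max clamp) is the usual gemmlowp-style fixed-point requantization. The scalar model below is only an illustrative sketch of what happens per 32-bit accumulator lane: the helper name, the plain non-negative `shift` parameter, and the omission of the vqrdmulhq_s32 saturation corner case (INT32_MIN * INT32_MIN) are simplifications made here for readability, not code from this patch.

#include <stdint.h>

/* Illustrative per-lane model of the requantization done by the NEON sequence above.
 * Names and the plain non-negative `shift` parameter are assumptions for readability;
 * the INT32_MIN saturation corner case of vqrdmulhq_s32 is not reproduced. */
static inline int8_t qs8_requantize_scalar(
    int32_t acc, int32_t multiplier, uint32_t shift,
    int16_t output_zero_point, int8_t output_min, int8_t output_max)
{
  /* vqrdmulhq_s32: rounding doubling high multiply by the fixed-point multiplier.
   * (product + 2^30) >> 31 equals (2*product + 2^31) >> 32 without overflowing int64. */
  const int64_t product = (int64_t) acc * (int64_t) multiplier;
  int32_t scaled = (int32_t) ((product + (INT64_C(1) << 30)) >> 31);

  /* vsraq_n_s32/vbicq_s32 + vrshlq_s32: rounding right shift with ties rounded away
   * from zero; the extra -1 applied to negative values is the fix-up that the vector
   * code expresses as vsraq_n_s32(acc, vbicq_s32(acc, vzero_shift_mask), 31). */
  if (shift != 0) {
    const int64_t bias = (scaled < 0) ? -1 : 0;
    scaled = (int32_t) (((int64_t) scaled + bias + (INT64_C(1) << (shift - 1))) >> shift);
  }

  /* vqaddq_s16/vqmovn followed by vmaxq_s8/vminq_s8: add the output zero point,
   * then clamp to the requested output range. */
  int32_t out = scaled + (int32_t) output_zero_point;
  if (out < (int32_t) output_min) out = (int32_t) output_min;
  if (out > (int32_t) output_max) out = (int32_t) output_max;
  return (int8_t) out;
}
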
diff --git a/src/qs8-gemm/gen/4x16c2-minmax-neon-mull-padal-dup.c b/src/qs8-gemm/gen/4x16c2-minmax-neon-mull-padal-dup.c
new file mode 100644
index 0000000..ea9270c
--- /dev/null
+++ b/src/qs8-gemm/gen/4x16c2-minmax-neon-mull-padal-dup.c
@@ -0,0 +1,512 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/c2-neon-mull-padal-dup.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+  const int8_t* a3 = (const int8_t*) ((uintptr_t) a2 + a_stride);
+  int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    a3 = a2;
+    c3 = c2;
+  }
+
+  do {
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc1x0123 = vacc0x0123;
+    int32x4_t vacc1x4567 = vacc0x4567;
+    int32x4_t vacc1x89AB = vacc0x89AB;
+    int32x4_t vacc1xCDEF = vacc0xCDEF;
+    int32x4_t vacc2x0123 = vacc0x0123;
+    int32x4_t vacc2x4567 = vacc0x4567;
+    int32x4_t vacc2x89AB = vacc0x89AB;
+    int32x4_t vacc2xCDEF = vacc0xCDEF;
+    int32x4_t vacc3x0123 = vacc0x0123;
+    int32x4_t vacc3x4567 = vacc0x4567;
+    int32x4_t vacc3x89AB = vacc0x89AB;
+    int32x4_t vacc3xCDEF = vacc0xCDEF;
+
+    size_t k = kc;
+
+
+    while (k >= 8 * sizeof(int8_t)) {
+      const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+      const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+      const int8x8_t va2 = vld1_s8(a2); a2 += 8;
+      const int8x8_t va3 = vld1_s8(a3); a3 += 8;
+
+      const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vbCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vbCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vbCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vbCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+      const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+      const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+      const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+      const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+      const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+      const int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      const int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+      const int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+      const int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+      vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+      vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+      vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+      vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
+      const int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      const int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+      const int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+      const int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+      vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+      vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+      vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+      vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
+      const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+      const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+      const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+      const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+      const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+      const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+      const int16x8_t vprod1x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      const int16x8_t vprod1x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+      const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+      const int16x8_t vprod1x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+      vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc0);
+      vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc1);
+      vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
+      vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc3);
+      const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+      const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+      const int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+      vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0);
+      vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1);
+      vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
+      vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3);
+      const int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+      const int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+      const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+      const int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+      vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+      vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+      vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+      vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
+      const int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+      const int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+      const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+      const int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+      vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+      vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+      vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+      vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
+      const int16x8_t vprod2x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+      const int16x8_t vprod2x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+      const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+      const int16x8_t vprod2x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+      vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
+      vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1);
+      vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
+      vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3);
+      const int16x8_t vprod2xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+      const int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+      const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+      const int16x8_t vprod2xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+      vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0);
+      vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1);
+      vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2);
+      vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3);
+      const int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+      const int16x8_t vprod3x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+      const int16x8_t vprod3x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+      const int16x8_t vprod3x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
+      vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c0);
+      vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c1);
+      vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c2);
+      vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c3);
+      const int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+      const int16x8_t vprod3x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+      const int16x8_t vprod3x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+      const int16x8_t vprod3x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
+      vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c0);
+      vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c1);
+      vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c2);
+      vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c3);
+      const int16x8_t vprod3x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+      const int16x8_t vprod3x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+      const int16x8_t vprod3x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+      const int16x8_t vprod3x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
+      vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc0);
+      vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc1);
+      vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc2);
+      vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc3);
+      const int16x8_t vprod3xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+      const int16x8_t vprod3xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+      const int16x8_t vprod3xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+      const int16x8_t vprod3xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
+      vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc0);
+      vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc1);
+      vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc2);
+      vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc3);
+
+      k -= 8 * sizeof(int8_t);
+    }
+
+    if XNN_UNLIKELY(k != 0) {
+      const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+      const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
+      const int8x8_t va2 = vld1_s8(a2); a2 = (const int8_t*) ((uintptr_t) a2 + k);
+      const int8x8_t va3 = vld1_s8(a3); a3 = (const int8_t*) ((uintptr_t) a3 + k);
+
+      const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb89ABc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vbCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+      const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+      const int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+      const int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+      const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+      const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+      const int16x8_t vprod1x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc0);
+      const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0);
+      const int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+      vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+      const int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+      vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+      const int16x8_t vprod2x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+      vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
+      const int16x8_t vprod2xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+      vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0);
+      const int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+      vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c0);
+      const int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+      vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c0);
+      const int16x8_t vprod3x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+      vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc0);
+      const int16x8_t vprod3xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+      vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc0);
+
+      if (k > 2 * sizeof(int8_t)) {
+        const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+        const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+        const int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+        const int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+        const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+        const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+        const int16x8_t vprod1x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc1);
+        const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1);
+        const int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+        const int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+        const int16x8_t vprod2x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+        vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1);
+        const int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+        vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1);
+        const int16x8_t vprod3x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+        vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c1);
+        const int16x8_t vprod3x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+        vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c1);
+        const int16x8_t vprod3x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+        vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc1);
+        const int16x8_t vprod3xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+        vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc1);
+
+        if (k > 4 * sizeof(int8_t)) {
+          const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb89ABc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vbCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+          vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+          const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+          vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+          const int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+          vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+          const int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+          vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+          const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+          vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+          const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+          vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+          const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+          vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
+          const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+          vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
+          const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+          vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+          const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+          vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+          const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+          vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
+          const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+          vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2);
+          const int16x8_t vprod3x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+          vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c2);
+          const int16x8_t vprod3x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+          vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c2);
+          const int16x8_t vprod3x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+          vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc2);
+          const int16x8_t vprod3xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+          vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc2);
+        }
+      }
+    }
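+    // Requantize: saturating doubling high multiply (vqrdmulhq_s32) by the fixed-point multiplier, followed by a rounding right shift.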
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+    vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc1x89AB = vqrdmulhq_s32(vacc1x89AB, vmultiplier);
+    vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier);
+    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+    vacc2x89AB = vqrdmulhq_s32(vacc2x89AB, vmultiplier);
+    vacc2xCDEF = vqrdmulhq_s32(vacc2xCDEF, vmultiplier);
+    vacc3x0123 = vqrdmulhq_s32(vacc3x0123, vmultiplier);
+    vacc3x4567 = vqrdmulhq_s32(vacc3x4567, vmultiplier);
+    vacc3x89AB = vqrdmulhq_s32(vacc3x89AB, vmultiplier);
+    vacc3xCDEF = vqrdmulhq_s32(vacc3xCDEF, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+    vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc1x89AB = vsraq_n_s32(vacc1x89AB, vbicq_s32(vacc1x89AB, vzero_shift_mask), 31);
+    vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31);
+    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+    vacc2x89AB = vsraq_n_s32(vacc2x89AB, vbicq_s32(vacc2x89AB, vzero_shift_mask), 31);
+    vacc2xCDEF = vsraq_n_s32(vacc2xCDEF, vbicq_s32(vacc2xCDEF, vzero_shift_mask), 31);
+    vacc3x0123 = vsraq_n_s32(vacc3x0123, vbicq_s32(vacc3x0123, vzero_shift_mask), 31);
+    vacc3x4567 = vsraq_n_s32(vacc3x4567, vbicq_s32(vacc3x4567, vzero_shift_mask), 31);
+    vacc3x89AB = vsraq_n_s32(vacc3x89AB, vbicq_s32(vacc3x89AB, vzero_shift_mask), 31);
+    vacc3xCDEF = vsraq_n_s32(vacc3xCDEF, vbicq_s32(vacc3xCDEF, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc1x89AB = vrshlq_s32(vacc1x89AB, vright_shift);
+    vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_shift);
+    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+    vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_shift);
+    vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_shift);
+    vacc3x0123 = vrshlq_s32(vacc3x0123, vright_shift);
+    vacc3x4567 = vrshlq_s32(vacc3x4567, vright_shift);
+    vacc3x89AB = vrshlq_s32(vacc3x89AB, vright_shift);
+    vacc3xCDEF = vrshlq_s32(vacc3xCDEF, vright_shift);
+
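+    // Narrow to 16 bits with saturation while adding the output zero point, then saturate down to 8 bits.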
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+    const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), voutput_zero_point);
+    const int16x8_t vacc3x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567), voutput_zero_point);
+    const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+    int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
+    int8x16_t vout2x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc2x89ABCDEF);
+    int8x16_t vout3x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc3x01234567), vacc3x89ABCDEF);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+    const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)), voutput_zero_point);
+    const int16x8_t vacc3x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x0123), vqmovn_s32(vacc3x4567)), voutput_zero_point);
+    const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3xCDEF)), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+    int8x16_t vout1x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc1x01234567), vqmovn_s16(vacc1x89ABCDEF));
+    int8x16_t vout2x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc2x89ABCDEF));
+    int8x16_t vout3x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc3x01234567), vqmovn_s16(vacc3x89ABCDEF));
+#endif
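+    // Clamp the 8-bit results to the output min/max range.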
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+    vout1x0123456789ABCDEF = vmaxq_s8(vout1x0123456789ABCDEF, voutput_min);
+    vout2x0123456789ABCDEF = vmaxq_s8(vout2x0123456789ABCDEF, voutput_min);
+    vout3x0123456789ABCDEF = vmaxq_s8(vout3x0123456789ABCDEF, voutput_min);
+
+    vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+    vout1x0123456789ABCDEF = vminq_s8(vout1x0123456789ABCDEF, voutput_max);
+    vout2x0123456789ABCDEF = vminq_s8(vout2x0123456789ABCDEF, voutput_max);
+    vout3x0123456789ABCDEF = vminq_s8(vout3x0123456789ABCDEF, voutput_max);
+
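+    // Full-width store of the 4x16 output tile; the else-branch handles the final columns when nc < 16.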
+    if (nc >= 16) {
+      vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+      vst1q_s8(c1 + 0, vout1x0123456789ABCDEF);
+      vst1q_s8(c2 + 0, vout2x0123456789ABCDEF);
+      vst1q_s8(c3 + 0, vout3x0123456789ABCDEF);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+
+      nc -= 16;
+    } else {
+      int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8(vout1x0123456789ABCDEF));
+      int8x16_t vout2x01234567_3x01234567 = vcombine_s8(vget_low_s8(vout2x0123456789ABCDEF), vget_low_s8(vout3x0123456789ABCDEF));
+      if (nc & 8) {
+        vst1_s8(c0, vget_low_s8(vout0x01234567_1x01234567)); c0 += 8;
+        vst1_s8(c1, vget_high_s8(vout0x01234567_1x01234567)); c1 += 8;
+        vst1_s8(c2, vget_low_s8(vout2x01234567_3x01234567)); c2 += 8;
+        vst1_s8(c3, vget_high_s8(vout2x01234567_3x01234567)); c3 += 8;
+        vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0123456789ABCDEF));
+        vout2x01234567_3x01234567 = vcombine_s8(vget_high_s8(vout2x0123456789ABCDEF), vget_high_s8(vout3x0123456789ABCDEF));
+      }
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 0); c2 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c3, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 2); c3 += 4;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 0); c2 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c3, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 4); c3 += 2;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c2, vout2x01234567_3x01234567, 0);
+        vst1q_lane_s8(c3, vout2x01234567_3x01234567, 8);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-gemm/gen/4x16c4-minmax-neondot.c b/src/qs8-gemm/gen/4x16c4-minmax-neondot.c
index f651597..3c93b58 100644
--- a/src/qs8-gemm/gen/4x16c4-minmax-neondot.c
+++ b/src/qs8-gemm/gen/4x16c4-minmax-neondot.c
@@ -7,18 +7,14 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
-
 #include <assert.h>
 
 #include <arm_neon.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
-// This kernel uses ARMv8.2 dot-product instructions.
-//
-// Scalar model: xnn_qs8_gemm_minmax_ukernel_4x16c4__scalar. Refer to
-// that kernel for more comments.
 void xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot(
     size_t mr,
     size_t nc,
@@ -34,7 +30,12 @@
   assert(mr <= 4);
   assert(nc != 0);
   assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
 
+  kc = round_up_po2(kc, 4);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -88,14 +89,14 @@
       const int8x8_t va3x01234567 = vld1_s8(a3); a3 += 8;
 
       // Load a 8x16 block of weights.
-      const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb0123x89AB = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb0123xCDEF = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb4567x89AB = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb4567xCDEF = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
+      const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+      const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+      const int8x16_t vb0123x89AB = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+      const int8x16_t vb0123xCDEF = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+      const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+      const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+      const int8x16_t vb4567x89AB = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+      const int8x16_t vb4567xCDEF = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
 
       // Multiply-accumulate: 4x8 * 8x16 --> 4x16.
       vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb0123x0123, va0x01234567, 0);
@@ -133,19 +134,19 @@
 
       k -= 8 * sizeof(int8_t);
     }
-    // Handle up to 7 final positions of `k`
+    // Handle up to 4 final positions of `k`
     if XNN_UNLIKELY(k != 0) {
       // Load a 4x4 block of activations.
-      const int8x8_t va0x01234567 = vld1_s8(a0); a0 += k;
-      const int8x8_t va1x01234567 = vld1_s8(a1); a1 += k;
-      const int8x8_t va2x01234567 = vld1_s8(a2); a2 += k;
-      const int8x8_t va3x01234567 = vld1_s8(a3); a3 += k;
+      const int8x8_t va0x01234567 = vld1_s8(a0); a0 += 4;
+      const int8x8_t va1x01234567 = vld1_s8(a1); a1 += 4;
+      const int8x8_t va2x01234567 = vld1_s8(a2); a2 += 4;
+      const int8x8_t va3x01234567 = vld1_s8(a3); a3 += 4;
 
       // Load a 4x16 block of weights.
-      const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb0123x89AB = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb0123xCDEF = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
+      const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+      const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+      const int8x16_t vb0123x89AB = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+      const int8x16_t vb0123xCDEF = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
 
       // Multiply-accumulate: 4x4 * 4x16 --> 4x16.
       vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb0123x0123, va0x01234567, 0);
@@ -164,42 +165,9 @@
       vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb0123x4567, va3x01234567, 0);
       vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb0123x89AB, va3x01234567, 0);
       vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb0123xCDEF, va3x01234567, 0);
-
-      if (k > 4) {
-        // Load a 4x16 block of weights.
-        const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb4567x89AB = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb4567xCDEF = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-
-        // Multiply-accumulate: 4x4 * 4x16 --> 4x16.
-        vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
-        vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
-        vacc0x89AB = vdotq_lane_s32(vacc0x89AB, vb4567x89AB, va0x01234567, 1);
-        vacc0xCDEF = vdotq_lane_s32(vacc0xCDEF, vb4567xCDEF, va0x01234567, 1);
-        vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb4567x0123, va1x01234567, 1);
-        vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb4567x4567, va1x01234567, 1);
-        vacc1x89AB = vdotq_lane_s32(vacc1x89AB, vb4567x89AB, va1x01234567, 1);
-        vacc1xCDEF = vdotq_lane_s32(vacc1xCDEF, vb4567xCDEF, va1x01234567, 1);
-        vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb4567x0123, va2x01234567, 1);
-        vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb4567x4567, va2x01234567, 1);
-        vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb4567x89AB, va2x01234567, 1);
-        vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb4567xCDEF, va2x01234567, 1);
-        vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb4567x0123, va3x01234567, 1);
-        vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb4567x4567, va3x01234567, 1);
-        vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb4567x89AB, va3x01234567, 1);
-        vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb4567xCDEF, va3x01234567, 1);
-      }
     }
-    // End of accumulation loop. The variable `kc` contains the amount by which
-    // we advanced the `va` pointers, so we rewind by this amount now.
-    a0 = (const int8_t*)((uintptr_t)a0 - kc);
-    a1 = (const int8_t*)((uintptr_t)a1 - kc);
-    a2 = (const int8_t*)((uintptr_t)a2 - kc);
-    a3 = (const int8_t*)((uintptr_t)a3 - kc);
 
     // Post-accumulation work
-
     const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
     const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
 
@@ -310,6 +278,11 @@
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
       c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+
       nc -= 16;
     } else {
       // Final case where not all of the 16 columns fit in the destination.
diff --git a/src/qs8-gemm/gen/4x16c8-minmax-avx512skx.c b/src/qs8-gemm/gen/4x16c8-minmax-avx512skx.c
index 0ded25f..afa0f41 100644
--- a/src/qs8-gemm/gen/4x16c8-minmax-avx512skx.c
+++ b/src/qs8-gemm/gen/4x16c8-minmax-avx512skx.c
@@ -13,6 +13,7 @@
 
 #include <xnnpack/gemm.h>
 #include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx(
@@ -36,6 +37,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
diff --git a/src/qs8-gemm/gen/4x16c8-minmax-neon-mlal-padal.c b/src/qs8-gemm/gen/4x16c8-minmax-neon-mlal-padal.c
new file mode 100644
index 0000000..b375027
--- /dev/null
+++ b/src/qs8-gemm/gen/4x16c8-minmax-neon-mlal-padal.c
@@ -0,0 +1,837 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/c8-neon-mull-padal.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+  const int8_t* a3 = (const int8_t*) ((uintptr_t) a2 + a_stride);
+  int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    a3 = a2;
+    c3 = c2;
+  }
+
+  do {
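+    // Initialize the accumulators from the per-channel bias stored at the start of the packed weights w.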
+    int32x4_t vacc0x0 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x1 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x2 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x3 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x4 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x5 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x6 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x7 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x8 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x9 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x10 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x11 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x12 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x13 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x14 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x15 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc1x0 = vacc0x0;
+    int32x4_t vacc1x1 = vacc0x1;
+    int32x4_t vacc1x2 = vacc0x2;
+    int32x4_t vacc1x3 = vacc0x3;
+    int32x4_t vacc1x4 = vacc0x4;
+    int32x4_t vacc1x5 = vacc0x5;
+    int32x4_t vacc1x6 = vacc0x6;
+    int32x4_t vacc1x7 = vacc0x7;
+    int32x4_t vacc1x8 = vacc0x8;
+    int32x4_t vacc1x9 = vacc0x9;
+    int32x4_t vacc1x10 = vacc0x10;
+    int32x4_t vacc1x11 = vacc0x11;
+    int32x4_t vacc1x12 = vacc0x12;
+    int32x4_t vacc1x13 = vacc0x13;
+    int32x4_t vacc1x14 = vacc0x14;
+    int32x4_t vacc1x15 = vacc0x15;
+    int32x4_t vacc2x0 = vacc0x0;
+    int32x4_t vacc2x1 = vacc0x1;
+    int32x4_t vacc2x2 = vacc0x2;
+    int32x4_t vacc2x3 = vacc0x3;
+    int32x4_t vacc2x4 = vacc0x4;
+    int32x4_t vacc2x5 = vacc0x5;
+    int32x4_t vacc2x6 = vacc0x6;
+    int32x4_t vacc2x7 = vacc0x7;
+    int32x4_t vacc2x8 = vacc0x8;
+    int32x4_t vacc2x9 = vacc0x9;
+    int32x4_t vacc2x10 = vacc0x10;
+    int32x4_t vacc2x11 = vacc0x11;
+    int32x4_t vacc2x12 = vacc0x12;
+    int32x4_t vacc2x13 = vacc0x13;
+    int32x4_t vacc2x14 = vacc0x14;
+    int32x4_t vacc2x15 = vacc0x15;
+    int32x4_t vacc3x0 = vacc0x0;
+    int32x4_t vacc3x1 = vacc0x1;
+    int32x4_t vacc3x2 = vacc0x2;
+    int32x4_t vacc3x3 = vacc0x3;
+    int32x4_t vacc3x4 = vacc0x4;
+    int32x4_t vacc3x5 = vacc0x5;
+    int32x4_t vacc3x6 = vacc0x6;
+    int32x4_t vacc3x7 = vacc0x7;
+    int32x4_t vacc3x8 = vacc0x8;
+    int32x4_t vacc3x9 = vacc0x9;
+    int32x4_t vacc3x10 = vacc0x10;
+    int32x4_t vacc3x11 = vacc0x11;
+    int32x4_t vacc3x12 = vacc0x12;
+    int32x4_t vacc3x13 = vacc0x13;
+    int32x4_t vacc3x14 = vacc0x14;
+    int32x4_t vacc3x15 = vacc0x15;
+
+    size_t k = kc;
+    // 2x partial unrolled loop to load 16 bytes at a time using MLA.
+    while (k >= 16 * sizeof(int8_t)) {
+      const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;
+      const int8x8_t va0x1 = vld1_s8(a0); a0 += 8;
+      const int8x8_t va1x0 = vld1_s8(a1); a1 += 8;
+      const int8x8_t va1x1 = vld1_s8(a1); a1 += 8;
+      const int8x8_t va2x0 = vld1_s8(a2); a2 += 8;
+      const int8x8_t va2x1 = vld1_s8(a2); a2 += 8;
+      const int8x8_t va3x0 = vld1_s8(a3); a3 += 8;
+      const int8x8_t va3x1 = vld1_s8(a3); a3 += 8;
+
+      const int8x8_t vb0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb4x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb5x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb6x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb7x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb8x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb9x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb10x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb11x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb12x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb13x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb14x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      const int8x8_t vb15x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+
+      const int8x8_t vb0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x0 = vmull_s8(vb0x0, va0x0);
+      int16x8_t vprod1x0 = vmull_s8(vb0x0, va1x0);
+      int16x8_t vprod2x0 = vmull_s8(vb0x0, va2x0);
+      int16x8_t vprod3x0 = vmull_s8(vb0x0, va3x0);
+      vprod0x0 = vmlal_s8(vprod0x0, vb0x1, va0x1);
+      vprod1x0 = vmlal_s8(vprod1x0, vb0x1, va1x1);
+      vprod2x0 = vmlal_s8(vprod2x0, vb0x1, va2x1);
+      vprod3x0 = vmlal_s8(vprod3x0, vb0x1, va3x1);
+      vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+      vacc1x0 = vpadalq_s16(vacc1x0, vprod1x0);
+      vacc2x0 = vpadalq_s16(vacc2x0, vprod2x0);
+      vacc3x0 = vpadalq_s16(vacc3x0, vprod3x0);
+      const int8x8_t vb1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x1 = vmull_s8(vb1x0, va0x0);
+      int16x8_t vprod1x1 = vmull_s8(vb1x0, va1x0);
+      int16x8_t vprod2x1 = vmull_s8(vb1x0, va2x0);
+      int16x8_t vprod3x1 = vmull_s8(vb1x0, va3x0);
+      vprod0x1 = vmlal_s8(vprod0x1, vb1x1, va0x1);
+      vprod1x1 = vmlal_s8(vprod1x1, vb1x1, va1x1);
+      vprod2x1 = vmlal_s8(vprod2x1, vb1x1, va2x1);
+      vprod3x1 = vmlal_s8(vprod3x1, vb1x1, va3x1);
+      vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+      vacc1x1 = vpadalq_s16(vacc1x1, vprod1x1);
+      vacc2x1 = vpadalq_s16(vacc2x1, vprod2x1);
+      vacc3x1 = vpadalq_s16(vacc3x1, vprod3x1);
+      const int8x8_t vb2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x2 = vmull_s8(vb2x0, va0x0);
+      int16x8_t vprod1x2 = vmull_s8(vb2x0, va1x0);
+      int16x8_t vprod2x2 = vmull_s8(vb2x0, va2x0);
+      int16x8_t vprod3x2 = vmull_s8(vb2x0, va3x0);
+      vprod0x2 = vmlal_s8(vprod0x2, vb2x1, va0x1);
+      vprod1x2 = vmlal_s8(vprod1x2, vb2x1, va1x1);
+      vprod2x2 = vmlal_s8(vprod2x2, vb2x1, va2x1);
+      vprod3x2 = vmlal_s8(vprod3x2, vb2x1, va3x1);
+      vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+      vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);
+      vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
+      vacc3x2 = vpadalq_s16(vacc3x2, vprod3x2);
+      const int8x8_t vb3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x3 = vmull_s8(vb3x0, va0x0);
+      int16x8_t vprod1x3 = vmull_s8(vb3x0, va1x0);
+      int16x8_t vprod2x3 = vmull_s8(vb3x0, va2x0);
+      int16x8_t vprod3x3 = vmull_s8(vb3x0, va3x0);
+      vprod0x3 = vmlal_s8(vprod0x3, vb3x1, va0x1);
+      vprod1x3 = vmlal_s8(vprod1x3, vb3x1, va1x1);
+      vprod2x3 = vmlal_s8(vprod2x3, vb3x1, va2x1);
+      vprod3x3 = vmlal_s8(vprod3x3, vb3x1, va3x1);
+      vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+      vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);
+      vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
+      vacc3x3 = vpadalq_s16(vacc3x3, vprod3x3);
+      const int8x8_t vb4x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x4 = vmull_s8(vb4x0, va0x0);
+      int16x8_t vprod1x4 = vmull_s8(vb4x0, va1x0);
+      int16x8_t vprod2x4 = vmull_s8(vb4x0, va2x0);
+      int16x8_t vprod3x4 = vmull_s8(vb4x0, va3x0);
+      vprod0x4 = vmlal_s8(vprod0x4, vb4x1, va0x1);
+      vprod1x4 = vmlal_s8(vprod1x4, vb4x1, va1x1);
+      vprod2x4 = vmlal_s8(vprod2x4, vb4x1, va2x1);
+      vprod3x4 = vmlal_s8(vprod3x4, vb4x1, va3x1);
+      vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+      vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);
+      vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);
+      vacc3x4 = vpadalq_s16(vacc3x4, vprod3x4);
+      const int8x8_t vb5x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x5 = vmull_s8(vb5x0, va0x0);
+      int16x8_t vprod1x5 = vmull_s8(vb5x0, va1x0);
+      int16x8_t vprod2x5 = vmull_s8(vb5x0, va2x0);
+      int16x8_t vprod3x5 = vmull_s8(vb5x0, va3x0);
+      vprod0x5 = vmlal_s8(vprod0x5, vb5x1, va0x1);
+      vprod1x5 = vmlal_s8(vprod1x5, vb5x1, va1x1);
+      vprod2x5 = vmlal_s8(vprod2x5, vb5x1, va2x1);
+      vprod3x5 = vmlal_s8(vprod3x5, vb5x1, va3x1);
+      vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+      vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);
+      vacc2x5 = vpadalq_s16(vacc2x5, vprod2x5);
+      vacc3x5 = vpadalq_s16(vacc3x5, vprod3x5);
+      const int8x8_t vb6x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x6 = vmull_s8(vb6x0, va0x0);
+      int16x8_t vprod1x6 = vmull_s8(vb6x0, va1x0);
+      int16x8_t vprod2x6 = vmull_s8(vb6x0, va2x0);
+      int16x8_t vprod3x6 = vmull_s8(vb6x0, va3x0);
+      vprod0x6 = vmlal_s8(vprod0x6, vb6x1, va0x1);
+      vprod1x6 = vmlal_s8(vprod1x6, vb6x1, va1x1);
+      vprod2x6 = vmlal_s8(vprod2x6, vb6x1, va2x1);
+      vprod3x6 = vmlal_s8(vprod3x6, vb6x1, va3x1);
+      vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+      vacc1x6 = vpadalq_s16(vacc1x6, vprod1x6);
+      vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6);
+      vacc3x6 = vpadalq_s16(vacc3x6, vprod3x6);
+      const int8x8_t vb7x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x7 = vmull_s8(vb7x0, va0x0);
+      int16x8_t vprod1x7 = vmull_s8(vb7x0, va1x0);
+      int16x8_t vprod2x7 = vmull_s8(vb7x0, va2x0);
+      int16x8_t vprod3x7 = vmull_s8(vb7x0, va3x0);
+      vprod0x7 = vmlal_s8(vprod0x7, vb7x1, va0x1);
+      vprod1x7 = vmlal_s8(vprod1x7, vb7x1, va1x1);
+      vprod2x7 = vmlal_s8(vprod2x7, vb7x1, va2x1);
+      vprod3x7 = vmlal_s8(vprod3x7, vb7x1, va3x1);
+      vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+      vacc1x7 = vpadalq_s16(vacc1x7, vprod1x7);
+      vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7);
+      vacc3x7 = vpadalq_s16(vacc3x7, vprod3x7);
+      const int8x8_t vb8x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x8 = vmull_s8(vb8x0, va0x0);
+      int16x8_t vprod1x8 = vmull_s8(vb8x0, va1x0);
+      int16x8_t vprod2x8 = vmull_s8(vb8x0, va2x0);
+      int16x8_t vprod3x8 = vmull_s8(vb8x0, va3x0);
+      vprod0x8 = vmlal_s8(vprod0x8, vb8x1, va0x1);
+      vprod1x8 = vmlal_s8(vprod1x8, vb8x1, va1x1);
+      vprod2x8 = vmlal_s8(vprod2x8, vb8x1, va2x1);
+      vprod3x8 = vmlal_s8(vprod3x8, vb8x1, va3x1);
+      vacc0x8 = vpadalq_s16(vacc0x8, vprod0x8);
+      vacc1x8 = vpadalq_s16(vacc1x8, vprod1x8);
+      vacc2x8 = vpadalq_s16(vacc2x8, vprod2x8);
+      vacc3x8 = vpadalq_s16(vacc3x8, vprod3x8);
+      const int8x8_t vb9x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x9 = vmull_s8(vb9x0, va0x0);
+      int16x8_t vprod1x9 = vmull_s8(vb9x0, va1x0);
+      int16x8_t vprod2x9 = vmull_s8(vb9x0, va2x0);
+      int16x8_t vprod3x9 = vmull_s8(vb9x0, va3x0);
+      vprod0x9 = vmlal_s8(vprod0x9, vb9x1, va0x1);
+      vprod1x9 = vmlal_s8(vprod1x9, vb9x1, va1x1);
+      vprod2x9 = vmlal_s8(vprod2x9, vb9x1, va2x1);
+      vprod3x9 = vmlal_s8(vprod3x9, vb9x1, va3x1);
+      vacc0x9 = vpadalq_s16(vacc0x9, vprod0x9);
+      vacc1x9 = vpadalq_s16(vacc1x9, vprod1x9);
+      vacc2x9 = vpadalq_s16(vacc2x9, vprod2x9);
+      vacc3x9 = vpadalq_s16(vacc3x9, vprod3x9);
+      const int8x8_t vb10x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x10 = vmull_s8(vb10x0, va0x0);
+      int16x8_t vprod1x10 = vmull_s8(vb10x0, va1x0);
+      int16x8_t vprod2x10 = vmull_s8(vb10x0, va2x0);
+      int16x8_t vprod3x10 = vmull_s8(vb10x0, va3x0);
+      vprod0x10 = vmlal_s8(vprod0x10, vb10x1, va0x1);
+      vprod1x10 = vmlal_s8(vprod1x10, vb10x1, va1x1);
+      vprod2x10 = vmlal_s8(vprod2x10, vb10x1, va2x1);
+      vprod3x10 = vmlal_s8(vprod3x10, vb10x1, va3x1);
+      vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
+      vacc1x10 = vpadalq_s16(vacc1x10, vprod1x10);
+      vacc2x10 = vpadalq_s16(vacc2x10, vprod2x10);
+      vacc3x10 = vpadalq_s16(vacc3x10, vprod3x10);
+      const int8x8_t vb11x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x11 = vmull_s8(vb11x0, va0x0);
+      int16x8_t vprod1x11 = vmull_s8(vb11x0, va1x0);
+      int16x8_t vprod2x11 = vmull_s8(vb11x0, va2x0);
+      int16x8_t vprod3x11 = vmull_s8(vb11x0, va3x0);
+      vprod0x11 = vmlal_s8(vprod0x11, vb11x1, va0x1);
+      vprod1x11 = vmlal_s8(vprod1x11, vb11x1, va1x1);
+      vprod2x11 = vmlal_s8(vprod2x11, vb11x1, va2x1);
+      vprod3x11 = vmlal_s8(vprod3x11, vb11x1, va3x1);
+      vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11);
+      vacc1x11 = vpadalq_s16(vacc1x11, vprod1x11);
+      vacc2x11 = vpadalq_s16(vacc2x11, vprod2x11);
+      vacc3x11 = vpadalq_s16(vacc3x11, vprod3x11);
+      const int8x8_t vb12x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x12 = vmull_s8(vb12x0, va0x0);
+      int16x8_t vprod1x12 = vmull_s8(vb12x0, va1x0);
+      int16x8_t vprod2x12 = vmull_s8(vb12x0, va2x0);
+      int16x8_t vprod3x12 = vmull_s8(vb12x0, va3x0);
+      vprod0x12 = vmlal_s8(vprod0x12, vb12x1, va0x1);
+      vprod1x12 = vmlal_s8(vprod1x12, vb12x1, va1x1);
+      vprod2x12 = vmlal_s8(vprod2x12, vb12x1, va2x1);
+      vprod3x12 = vmlal_s8(vprod3x12, vb12x1, va3x1);
+      vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);
+      vacc1x12 = vpadalq_s16(vacc1x12, vprod1x12);
+      vacc2x12 = vpadalq_s16(vacc2x12, vprod2x12);
+      vacc3x12 = vpadalq_s16(vacc3x12, vprod3x12);
+      const int8x8_t vb13x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x13 = vmull_s8(vb13x0, va0x0);
+      int16x8_t vprod1x13 = vmull_s8(vb13x0, va1x0);
+      int16x8_t vprod2x13 = vmull_s8(vb13x0, va2x0);
+      int16x8_t vprod3x13 = vmull_s8(vb13x0, va3x0);
+      vprod0x13 = vmlal_s8(vprod0x13, vb13x1, va0x1);
+      vprod1x13 = vmlal_s8(vprod1x13, vb13x1, va1x1);
+      vprod2x13 = vmlal_s8(vprod2x13, vb13x1, va2x1);
+      vprod3x13 = vmlal_s8(vprod3x13, vb13x1, va3x1);
+      vacc0x13 = vpadalq_s16(vacc0x13, vprod0x13);
+      vacc1x13 = vpadalq_s16(vacc1x13, vprod1x13);
+      vacc2x13 = vpadalq_s16(vacc2x13, vprod2x13);
+      vacc3x13 = vpadalq_s16(vacc3x13, vprod3x13);
+      const int8x8_t vb14x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x14 = vmull_s8(vb14x0, va0x0);
+      int16x8_t vprod1x14 = vmull_s8(vb14x0, va1x0);
+      int16x8_t vprod2x14 = vmull_s8(vb14x0, va2x0);
+      int16x8_t vprod3x14 = vmull_s8(vb14x0, va3x0);
+      vprod0x14 = vmlal_s8(vprod0x14, vb14x1, va0x1);
+      vprod1x14 = vmlal_s8(vprod1x14, vb14x1, va1x1);
+      vprod2x14 = vmlal_s8(vprod2x14, vb14x1, va2x1);
+      vprod3x14 = vmlal_s8(vprod3x14, vb14x1, va3x1);
+      vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
+      vacc1x14 = vpadalq_s16(vacc1x14, vprod1x14);
+      vacc2x14 = vpadalq_s16(vacc2x14, vprod2x14);
+      vacc3x14 = vpadalq_s16(vacc3x14, vprod3x14);
+      const int8x8_t vb15x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+      int16x8_t vprod0x15 = vmull_s8(vb15x0, va0x0);
+      int16x8_t vprod1x15 = vmull_s8(vb15x0, va1x0);
+      int16x8_t vprod2x15 = vmull_s8(vb15x0, va2x0);
+      int16x8_t vprod3x15 = vmull_s8(vb15x0, va3x0);
+      vprod0x15 = vmlal_s8(vprod0x15, vb15x1, va0x1);
+      vprod1x15 = vmlal_s8(vprod1x15, vb15x1, va1x1);
+      vprod2x15 = vmlal_s8(vprod2x15, vb15x1, va2x1);
+      vprod3x15 = vmlal_s8(vprod3x15, vb15x1, va3x1);
+      vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
+      vacc1x15 = vpadalq_s16(vacc1x15, vprod1x15);
+      vacc2x15 = vpadalq_s16(vacc2x15, vprod2x15);
+      vacc3x15 = vpadalq_s16(vacc3x15, vprod3x15);
+
+      k -= 16 * sizeof(int8_t);
+    }
+
+    // Handle 8 bytes at a time using MUL.
+    if (k > 0) {
+      const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+      const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+      const int8x8_t va2 = vld1_s8(a2); a2 += 8;
+      const int8x8_t va3 = vld1_s8(a3); a3 += 8;
+
+      const int8x8_t vb0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x0 = vmull_s8(vb0, va0);
+      const int16x8_t vprod1x0 = vmull_s8(vb0, va1);
+      const int16x8_t vprod2x0 = vmull_s8(vb0, va2);
+      const int16x8_t vprod3x0 = vmull_s8(vb0, va3);
+      vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+      vacc1x0 = vpadalq_s16(vacc1x0, vprod1x0);
+      vacc2x0 = vpadalq_s16(vacc2x0, vprod2x0);
+      vacc3x0 = vpadalq_s16(vacc3x0, vprod3x0);
+      const int8x8_t vb1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x1 = vmull_s8(vb1, va0);
+      const int16x8_t vprod1x1 = vmull_s8(vb1, va1);
+      const int16x8_t vprod2x1 = vmull_s8(vb1, va2);
+      const int16x8_t vprod3x1 = vmull_s8(vb1, va3);
+      vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+      vacc1x1 = vpadalq_s16(vacc1x1, vprod1x1);
+      vacc2x1 = vpadalq_s16(vacc2x1, vprod2x1);
+      vacc3x1 = vpadalq_s16(vacc3x1, vprod3x1);
+      const int8x8_t vb2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x2 = vmull_s8(vb2, va0);
+      const int16x8_t vprod1x2 = vmull_s8(vb2, va1);
+      const int16x8_t vprod2x2 = vmull_s8(vb2, va2);
+      const int16x8_t vprod3x2 = vmull_s8(vb2, va3);
+      vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+      vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);
+      vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
+      vacc3x2 = vpadalq_s16(vacc3x2, vprod3x2);
+      const int8x8_t vb3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x3 = vmull_s8(vb3, va0);
+      const int16x8_t vprod1x3 = vmull_s8(vb3, va1);
+      const int16x8_t vprod2x3 = vmull_s8(vb3, va2);
+      const int16x8_t vprod3x3 = vmull_s8(vb3, va3);
+      vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+      vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);
+      vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
+      vacc3x3 = vpadalq_s16(vacc3x3, vprod3x3);
+      const int8x8_t vb4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x4 = vmull_s8(vb4, va0);
+      const int16x8_t vprod1x4 = vmull_s8(vb4, va1);
+      const int16x8_t vprod2x4 = vmull_s8(vb4, va2);
+      const int16x8_t vprod3x4 = vmull_s8(vb4, va3);
+      vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+      vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);
+      vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);
+      vacc3x4 = vpadalq_s16(vacc3x4, vprod3x4);
+      const int8x8_t vb5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x5 = vmull_s8(vb5, va0);
+      const int16x8_t vprod1x5 = vmull_s8(vb5, va1);
+      const int16x8_t vprod2x5 = vmull_s8(vb5, va2);
+      const int16x8_t vprod3x5 = vmull_s8(vb5, va3);
+      vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+      vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);
+      vacc2x5 = vpadalq_s16(vacc2x5, vprod2x5);
+      vacc3x5 = vpadalq_s16(vacc3x5, vprod3x5);
+      const int8x8_t vb6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x6 = vmull_s8(vb6, va0);
+      const int16x8_t vprod1x6 = vmull_s8(vb6, va1);
+      const int16x8_t vprod2x6 = vmull_s8(vb6, va2);
+      const int16x8_t vprod3x6 = vmull_s8(vb6, va3);
+      vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+      vacc1x6 = vpadalq_s16(vacc1x6, vprod1x6);
+      vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6);
+      vacc3x6 = vpadalq_s16(vacc3x6, vprod3x6);
+      const int8x8_t vb7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x7 = vmull_s8(vb7, va0);
+      const int16x8_t vprod1x7 = vmull_s8(vb7, va1);
+      const int16x8_t vprod2x7 = vmull_s8(vb7, va2);
+      const int16x8_t vprod3x7 = vmull_s8(vb7, va3);
+      vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+      vacc1x7 = vpadalq_s16(vacc1x7, vprod1x7);
+      vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7);
+      vacc3x7 = vpadalq_s16(vacc3x7, vprod3x7);
+      const int8x8_t vb8 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x8 = vmull_s8(vb8, va0);
+      const int16x8_t vprod1x8 = vmull_s8(vb8, va1);
+      const int16x8_t vprod2x8 = vmull_s8(vb8, va2);
+      const int16x8_t vprod3x8 = vmull_s8(vb8, va3);
+      vacc0x8 = vpadalq_s16(vacc0x8, vprod0x8);
+      vacc1x8 = vpadalq_s16(vacc1x8, vprod1x8);
+      vacc2x8 = vpadalq_s16(vacc2x8, vprod2x8);
+      vacc3x8 = vpadalq_s16(vacc3x8, vprod3x8);
+      const int8x8_t vb9 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x9 = vmull_s8(vb9, va0);
+      const int16x8_t vprod1x9 = vmull_s8(vb9, va1);
+      const int16x8_t vprod2x9 = vmull_s8(vb9, va2);
+      const int16x8_t vprod3x9 = vmull_s8(vb9, va3);
+      vacc0x9 = vpadalq_s16(vacc0x9, vprod0x9);
+      vacc1x9 = vpadalq_s16(vacc1x9, vprod1x9);
+      vacc2x9 = vpadalq_s16(vacc2x9, vprod2x9);
+      vacc3x9 = vpadalq_s16(vacc3x9, vprod3x9);
+      const int8x8_t vb10 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x10 = vmull_s8(vb10, va0);
+      const int16x8_t vprod1x10 = vmull_s8(vb10, va1);
+      const int16x8_t vprod2x10 = vmull_s8(vb10, va2);
+      const int16x8_t vprod3x10 = vmull_s8(vb10, va3);
+      vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
+      vacc1x10 = vpadalq_s16(vacc1x10, vprod1x10);
+      vacc2x10 = vpadalq_s16(vacc2x10, vprod2x10);
+      vacc3x10 = vpadalq_s16(vacc3x10, vprod3x10);
+      const int8x8_t vb11 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x11 = vmull_s8(vb11, va0);
+      const int16x8_t vprod1x11 = vmull_s8(vb11, va1);
+      const int16x8_t vprod2x11 = vmull_s8(vb11, va2);
+      const int16x8_t vprod3x11 = vmull_s8(vb11, va3);
+      vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11);
+      vacc1x11 = vpadalq_s16(vacc1x11, vprod1x11);
+      vacc2x11 = vpadalq_s16(vacc2x11, vprod2x11);
+      vacc3x11 = vpadalq_s16(vacc3x11, vprod3x11);
+      const int8x8_t vb12 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x12 = vmull_s8(vb12, va0);
+      const int16x8_t vprod1x12 = vmull_s8(vb12, va1);
+      const int16x8_t vprod2x12 = vmull_s8(vb12, va2);
+      const int16x8_t vprod3x12 = vmull_s8(vb12, va3);
+      vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);
+      vacc1x12 = vpadalq_s16(vacc1x12, vprod1x12);
+      vacc2x12 = vpadalq_s16(vacc2x12, vprod2x12);
+      vacc3x12 = vpadalq_s16(vacc3x12, vprod3x12);
+      const int8x8_t vb13 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x13 = vmull_s8(vb13, va0);
+      const int16x8_t vprod1x13 = vmull_s8(vb13, va1);
+      const int16x8_t vprod2x13 = vmull_s8(vb13, va2);
+      const int16x8_t vprod3x13 = vmull_s8(vb13, va3);
+      vacc0x13 = vpadalq_s16(vacc0x13, vprod0x13);
+      vacc1x13 = vpadalq_s16(vacc1x13, vprod1x13);
+      vacc2x13 = vpadalq_s16(vacc2x13, vprod2x13);
+      vacc3x13 = vpadalq_s16(vacc3x13, vprod3x13);
+      const int8x8_t vb14 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x14 = vmull_s8(vb14, va0);
+      const int16x8_t vprod1x14 = vmull_s8(vb14, va1);
+      const int16x8_t vprod2x14 = vmull_s8(vb14, va2);
+      const int16x8_t vprod3x14 = vmull_s8(vb14, va3);
+      vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
+      vacc1x14 = vpadalq_s16(vacc1x14, vprod1x14);
+      vacc2x14 = vpadalq_s16(vacc2x14, vprod2x14);
+      vacc3x14 = vpadalq_s16(vacc3x14, vprod3x14);
+      const int8x8_t vb15 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x15 = vmull_s8(vb15, va0);
+      const int16x8_t vprod1x15 = vmull_s8(vb15, va1);
+      const int16x8_t vprod2x15 = vmull_s8(vb15, va2);
+      const int16x8_t vprod3x15 = vmull_s8(vb15, va3);
+      vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
+      vacc1x15 = vpadalq_s16(vacc1x15, vprod1x15);
+      vacc2x15 = vpadalq_s16(vacc2x15, vprod2x15);
+      vacc3x15 = vpadalq_s16(vacc3x15, vprod3x15);
+
+      k -= 8 * sizeof(int8_t);
+    }
+
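+    // Reduce the 16 per-column accumulators of each row down to four int32x4_t vectors (4 output columns each).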
+#if XNN_ARCH_ARM64
+    const int32x4_t vsum0x01 = vpaddq_s32(vacc0x0, vacc0x1);
+    const int32x4_t vsum0x23 = vpaddq_s32(vacc0x2, vacc0x3);
+    const int32x4_t vsum0x45 = vpaddq_s32(vacc0x4, vacc0x5);
+    const int32x4_t vsum0x67 = vpaddq_s32(vacc0x6, vacc0x7);
+    const int32x4_t vsum0x89 = vpaddq_s32(vacc0x8, vacc0x9);
+    const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11);
+    const int32x4_t vsum0xCD = vpaddq_s32(vacc0x12, vacc0x13);
+    const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
+    const int32x4_t vsum1x01 = vpaddq_s32(vacc1x0, vacc1x1);
+    const int32x4_t vsum1x23 = vpaddq_s32(vacc1x2, vacc1x3);
+    const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);
+    const int32x4_t vsum1x67 = vpaddq_s32(vacc1x6, vacc1x7);
+    const int32x4_t vsum1x89 = vpaddq_s32(vacc1x8, vacc1x9);
+    const int32x4_t vsum1xAB = vpaddq_s32(vacc1x10, vacc1x11);
+    const int32x4_t vsum1xCD = vpaddq_s32(vacc1x12, vacc1x13);
+    const int32x4_t vsum1xEF = vpaddq_s32(vacc1x14, vacc1x15);
+    const int32x4_t vsum2x01 = vpaddq_s32(vacc2x0, vacc2x1);
+    const int32x4_t vsum2x23 = vpaddq_s32(vacc2x2, vacc2x3);
+    const int32x4_t vsum2x45 = vpaddq_s32(vacc2x4, vacc2x5);
+    const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7);
+    const int32x4_t vsum2x89 = vpaddq_s32(vacc2x8, vacc2x9);
+    const int32x4_t vsum2xAB = vpaddq_s32(vacc2x10, vacc2x11);
+    const int32x4_t vsum2xCD = vpaddq_s32(vacc2x12, vacc2x13);
+    const int32x4_t vsum2xEF = vpaddq_s32(vacc2x14, vacc2x15);
+    const int32x4_t vsum3x01 = vpaddq_s32(vacc3x0, vacc3x1);
+    const int32x4_t vsum3x23 = vpaddq_s32(vacc3x2, vacc3x3);
+    const int32x4_t vsum3x45 = vpaddq_s32(vacc3x4, vacc3x5);
+    const int32x4_t vsum3x67 = vpaddq_s32(vacc3x6, vacc3x7);
+    const int32x4_t vsum3x89 = vpaddq_s32(vacc3x8, vacc3x9);
+    const int32x4_t vsum3xAB = vpaddq_s32(vacc3x10, vacc3x11);
+    const int32x4_t vsum3xCD = vpaddq_s32(vacc3x12, vacc3x13);
+    const int32x4_t vsum3xEF = vpaddq_s32(vacc3x14, vacc3x15);
+    int32x4_t vacc0x0123 = vpaddq_s32(vsum0x01, vsum0x23);
+    int32x4_t vacc0x4567 = vpaddq_s32(vsum0x45, vsum0x67);
+    int32x4_t vacc0x89AB = vpaddq_s32(vsum0x89, vsum0xAB);
+    int32x4_t vacc0xCDEF = vpaddq_s32(vsum0xCD, vsum0xEF);
+    int32x4_t vacc1x0123 = vpaddq_s32(vsum1x01, vsum1x23);
+    int32x4_t vacc1x4567 = vpaddq_s32(vsum1x45, vsum1x67);
+    int32x4_t vacc1x89AB = vpaddq_s32(vsum1x89, vsum1xAB);
+    int32x4_t vacc1xCDEF = vpaddq_s32(vsum1xCD, vsum1xEF);
+    int32x4_t vacc2x0123 = vpaddq_s32(vsum2x01, vsum2x23);
+    int32x4_t vacc2x4567 = vpaddq_s32(vsum2x45, vsum2x67);
+    int32x4_t vacc2x89AB = vpaddq_s32(vsum2x89, vsum2xAB);
+    int32x4_t vacc2xCDEF = vpaddq_s32(vsum2xCD, vsum2xEF);
+    int32x4_t vacc3x0123 = vpaddq_s32(vsum3x01, vsum3x23);
+    int32x4_t vacc3x4567 = vpaddq_s32(vsum3x45, vsum3x67);
+    int32x4_t vacc3x89AB = vpaddq_s32(vsum3x89, vsum3xAB);
+    int32x4_t vacc3xCDEF = vpaddq_s32(vsum3xCD, vsum3xEF);
+#else
+    const int32x2_t vpsum0x0 = vadd_s32(vget_low_s32(vacc0x0), vget_high_s32(vacc0x0));
+    const int32x2_t vpsum0x1 = vadd_s32(vget_low_s32(vacc0x1), vget_high_s32(vacc0x1));
+    const int32x2_t vpsum0x2 = vadd_s32(vget_low_s32(vacc0x2), vget_high_s32(vacc0x2));
+    const int32x2_t vpsum0x3 = vadd_s32(vget_low_s32(vacc0x3), vget_high_s32(vacc0x3));
+    const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1);
+    const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3);
+    int32x4_t vacc0x0123 = vcombine_s32(vsum0x01, vsum0x23 );
+    const int32x2_t vpsum0x4 = vadd_s32(vget_low_s32(vacc0x4), vget_high_s32(vacc0x4));
+    const int32x2_t vpsum0x5 = vadd_s32(vget_low_s32(vacc0x5), vget_high_s32(vacc0x5));
+    const int32x2_t vpsum0x6 = vadd_s32(vget_low_s32(vacc0x6), vget_high_s32(vacc0x6));
+    const int32x2_t vpsum0x7 = vadd_s32(vget_low_s32(vacc0x7), vget_high_s32(vacc0x7));
+    const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5);
+    const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7);
+    int32x4_t vacc0x4567 = vcombine_s32(vsum0x45, vsum0x67 );
+    const int32x2_t vpsum0x8 = vadd_s32(vget_low_s32(vacc0x8), vget_high_s32(vacc0x8));
+    const int32x2_t vpsum0x9 = vadd_s32(vget_low_s32(vacc0x9), vget_high_s32(vacc0x9));
+    const int32x2_t vpsum0xA = vadd_s32(vget_low_s32(vacc0x10), vget_high_s32(vacc0x10));
+    const int32x2_t vpsum0xB = vadd_s32(vget_low_s32(vacc0x11), vget_high_s32(vacc0x11));
+    const int32x2_t vsum0x89 = vpadd_s32(vpsum0x8, vpsum0x9);
+    const int32x2_t vsum0xAB = vpadd_s32(vpsum0xA, vpsum0xB);
+    int32x4_t vacc0x89AB = vcombine_s32(vsum0x89, vsum0xAB );
+    const int32x2_t vpsum0xC = vadd_s32(vget_low_s32(vacc0x12), vget_high_s32(vacc0x12));
+    const int32x2_t vpsum0xD = vadd_s32(vget_low_s32(vacc0x13), vget_high_s32(vacc0x13));
+    const int32x2_t vpsum0xE = vadd_s32(vget_low_s32(vacc0x14), vget_high_s32(vacc0x14));
+    const int32x2_t vpsum0xF = vadd_s32(vget_low_s32(vacc0x15), vget_high_s32(vacc0x15));
+    const int32x2_t vsum0xCD = vpadd_s32(vpsum0xC, vpsum0xD);
+    const int32x2_t vsum0xEF = vpadd_s32(vpsum0xE, vpsum0xF);
+    int32x4_t vacc0xCDEF = vcombine_s32(vsum0xCD, vsum0xEF );
+    const int32x2_t vpsum1x0 = vadd_s32(vget_low_s32(vacc1x0), vget_high_s32(vacc1x0));
+    const int32x2_t vpsum1x1 = vadd_s32(vget_low_s32(vacc1x1), vget_high_s32(vacc1x1));
+    const int32x2_t vpsum1x2 = vadd_s32(vget_low_s32(vacc1x2), vget_high_s32(vacc1x2));
+    const int32x2_t vpsum1x3 = vadd_s32(vget_low_s32(vacc1x3), vget_high_s32(vacc1x3));
+    const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1);
+    const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3);
+    int32x4_t vacc1x0123 = vcombine_s32(vsum1x01, vsum1x23 );
+    const int32x2_t vpsum1x4 = vadd_s32(vget_low_s32(vacc1x4), vget_high_s32(vacc1x4));
+    const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));
+    const int32x2_t vpsum1x6 = vadd_s32(vget_low_s32(vacc1x6), vget_high_s32(vacc1x6));
+    const int32x2_t vpsum1x7 = vadd_s32(vget_low_s32(vacc1x7), vget_high_s32(vacc1x7));
+    const int32x2_t vsum1x45 = vpadd_s32(vpsum1x4, vpsum1x5);
+    const int32x2_t vsum1x67 = vpadd_s32(vpsum1x6, vpsum1x7);
+    int32x4_t vacc1x4567 = vcombine_s32(vsum1x45, vsum1x67 );
+    const int32x2_t vpsum1x8 = vadd_s32(vget_low_s32(vacc1x8), vget_high_s32(vacc1x8));
+    const int32x2_t vpsum1x9 = vadd_s32(vget_low_s32(vacc1x9), vget_high_s32(vacc1x9));
+    const int32x2_t vpsum1xA = vadd_s32(vget_low_s32(vacc1x10), vget_high_s32(vacc1x10));
+    const int32x2_t vpsum1xB = vadd_s32(vget_low_s32(vacc1x11), vget_high_s32(vacc1x11));
+    const int32x2_t vsum1x89 = vpadd_s32(vpsum1x8, vpsum1x9);
+    const int32x2_t vsum1xAB = vpadd_s32(vpsum1xA, vpsum1xB);
+    int32x4_t vacc1x89AB = vcombine_s32(vsum1x89, vsum1xAB );
+    const int32x2_t vpsum1xC = vadd_s32(vget_low_s32(vacc1x12), vget_high_s32(vacc1x12));
+    const int32x2_t vpsum1xD = vadd_s32(vget_low_s32(vacc1x13), vget_high_s32(vacc1x13));
+    const int32x2_t vpsum1xE = vadd_s32(vget_low_s32(vacc1x14), vget_high_s32(vacc1x14));
+    const int32x2_t vpsum1xF = vadd_s32(vget_low_s32(vacc1x15), vget_high_s32(vacc1x15));
+    const int32x2_t vsum1xCD = vpadd_s32(vpsum1xC, vpsum1xD);
+    const int32x2_t vsum1xEF = vpadd_s32(vpsum1xE, vpsum1xF);
+    int32x4_t vacc1xCDEF = vcombine_s32(vsum1xCD, vsum1xEF );
+    const int32x2_t vpsum2x0 = vadd_s32(vget_low_s32(vacc2x0), vget_high_s32(vacc2x0));
+    const int32x2_t vpsum2x1 = vadd_s32(vget_low_s32(vacc2x1), vget_high_s32(vacc2x1));
+    const int32x2_t vpsum2x2 = vadd_s32(vget_low_s32(vacc2x2), vget_high_s32(vacc2x2));
+    const int32x2_t vpsum2x3 = vadd_s32(vget_low_s32(vacc2x3), vget_high_s32(vacc2x3));
+    const int32x2_t vsum2x01 = vpadd_s32(vpsum2x0, vpsum2x1);
+    const int32x2_t vsum2x23 = vpadd_s32(vpsum2x2, vpsum2x3);
+    int32x4_t vacc2x0123 = vcombine_s32(vsum2x01, vsum2x23 );
+    const int32x2_t vpsum2x4 = vadd_s32(vget_low_s32(vacc2x4), vget_high_s32(vacc2x4));
+    const int32x2_t vpsum2x5 = vadd_s32(vget_low_s32(vacc2x5), vget_high_s32(vacc2x5));
+    const int32x2_t vpsum2x6 = vadd_s32(vget_low_s32(vacc2x6), vget_high_s32(vacc2x6));
+    const int32x2_t vpsum2x7 = vadd_s32(vget_low_s32(vacc2x7), vget_high_s32(vacc2x7));
+    const int32x2_t vsum2x45 = vpadd_s32(vpsum2x4, vpsum2x5);
+    const int32x2_t vsum2x67 = vpadd_s32(vpsum2x6, vpsum2x7);
+    int32x4_t vacc2x4567 = vcombine_s32(vsum2x45, vsum2x67 );
+    const int32x2_t vpsum2x8 = vadd_s32(vget_low_s32(vacc2x8), vget_high_s32(vacc2x8));
+    const int32x2_t vpsum2x9 = vadd_s32(vget_low_s32(vacc2x9), vget_high_s32(vacc2x9));
+    const int32x2_t vpsum2xA = vadd_s32(vget_low_s32(vacc2x10), vget_high_s32(vacc2x10));
+    const int32x2_t vpsum2xB = vadd_s32(vget_low_s32(vacc2x11), vget_high_s32(vacc2x11));
+    const int32x2_t vsum2x89 = vpadd_s32(vpsum2x8, vpsum2x9);
+    const int32x2_t vsum2xAB = vpadd_s32(vpsum2xA, vpsum2xB);
+    int32x4_t vacc2x89AB = vcombine_s32(vsum2x89, vsum2xAB );
+    const int32x2_t vpsum2xC = vadd_s32(vget_low_s32(vacc2x12), vget_high_s32(vacc2x12));
+    const int32x2_t vpsum2xD = vadd_s32(vget_low_s32(vacc2x13), vget_high_s32(vacc2x13));
+    const int32x2_t vpsum2xE = vadd_s32(vget_low_s32(vacc2x14), vget_high_s32(vacc2x14));
+    const int32x2_t vpsum2xF = vadd_s32(vget_low_s32(vacc2x15), vget_high_s32(vacc2x15));
+    const int32x2_t vsum2xCD = vpadd_s32(vpsum2xC, vpsum2xD);
+    const int32x2_t vsum2xEF = vpadd_s32(vpsum2xE, vpsum2xF);
+    int32x4_t vacc2xCDEF = vcombine_s32(vsum2xCD, vsum2xEF );
+    const int32x2_t vpsum3x0 = vadd_s32(vget_low_s32(vacc3x0), vget_high_s32(vacc3x0));
+    const int32x2_t vpsum3x1 = vadd_s32(vget_low_s32(vacc3x1), vget_high_s32(vacc3x1));
+    const int32x2_t vpsum3x2 = vadd_s32(vget_low_s32(vacc3x2), vget_high_s32(vacc3x2));
+    const int32x2_t vpsum3x3 = vadd_s32(vget_low_s32(vacc3x3), vget_high_s32(vacc3x3));
+    const int32x2_t vsum3x01 = vpadd_s32(vpsum3x0, vpsum3x1);
+    const int32x2_t vsum3x23 = vpadd_s32(vpsum3x2, vpsum3x3);
+    int32x4_t vacc3x0123 = vcombine_s32(vsum3x01, vsum3x23 );
+    const int32x2_t vpsum3x4 = vadd_s32(vget_low_s32(vacc3x4), vget_high_s32(vacc3x4));
+    const int32x2_t vpsum3x5 = vadd_s32(vget_low_s32(vacc3x5), vget_high_s32(vacc3x5));
+    const int32x2_t vpsum3x6 = vadd_s32(vget_low_s32(vacc3x6), vget_high_s32(vacc3x6));
+    const int32x2_t vpsum3x7 = vadd_s32(vget_low_s32(vacc3x7), vget_high_s32(vacc3x7));
+    const int32x2_t vsum3x45 = vpadd_s32(vpsum3x4, vpsum3x5);
+    const int32x2_t vsum3x67 = vpadd_s32(vpsum3x6, vpsum3x7);
+    int32x4_t vacc3x4567 = vcombine_s32(vsum3x45, vsum3x67 );
+    const int32x2_t vpsum3x8 = vadd_s32(vget_low_s32(vacc3x8), vget_high_s32(vacc3x8));
+    const int32x2_t vpsum3x9 = vadd_s32(vget_low_s32(vacc3x9), vget_high_s32(vacc3x9));
+    const int32x2_t vpsum3xA = vadd_s32(vget_low_s32(vacc3x10), vget_high_s32(vacc3x10));
+    const int32x2_t vpsum3xB = vadd_s32(vget_low_s32(vacc3x11), vget_high_s32(vacc3x11));
+    const int32x2_t vsum3x89 = vpadd_s32(vpsum3x8, vpsum3x9);
+    const int32x2_t vsum3xAB = vpadd_s32(vpsum3xA, vpsum3xB);
+    int32x4_t vacc3x89AB = vcombine_s32(vsum3x89, vsum3xAB );
+    const int32x2_t vpsum3xC = vadd_s32(vget_low_s32(vacc3x12), vget_high_s32(vacc3x12));
+    const int32x2_t vpsum3xD = vadd_s32(vget_low_s32(vacc3x13), vget_high_s32(vacc3x13));
+    const int32x2_t vpsum3xE = vadd_s32(vget_low_s32(vacc3x14), vget_high_s32(vacc3x14));
+    const int32x2_t vpsum3xF = vadd_s32(vget_low_s32(vacc3x15), vget_high_s32(vacc3x15));
+    const int32x2_t vsum3xCD = vpadd_s32(vpsum3xC, vpsum3xD);
+    const int32x2_t vsum3xEF = vpadd_s32(vpsum3xE, vpsum3xF);
+    int32x4_t vacc3xCDEF = vcombine_s32(vsum3xCD, vsum3xEF );
+#endif
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+    vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc1x89AB = vqrdmulhq_s32(vacc1x89AB, vmultiplier);
+    vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier);
+    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+    vacc2x89AB = vqrdmulhq_s32(vacc2x89AB, vmultiplier);
+    vacc2xCDEF = vqrdmulhq_s32(vacc2xCDEF, vmultiplier);
+    vacc3x0123 = vqrdmulhq_s32(vacc3x0123, vmultiplier);
+    vacc3x4567 = vqrdmulhq_s32(vacc3x4567, vmultiplier);
+    vacc3x89AB = vqrdmulhq_s32(vacc3x89AB, vmultiplier);
+    vacc3xCDEF = vqrdmulhq_s32(vacc3xCDEF, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+    vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc1x89AB = vsraq_n_s32(vacc1x89AB, vbicq_s32(vacc1x89AB, vzero_shift_mask), 31);
+    vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31);
+    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+    vacc2x89AB = vsraq_n_s32(vacc2x89AB, vbicq_s32(vacc2x89AB, vzero_shift_mask), 31);
+    vacc2xCDEF = vsraq_n_s32(vacc2xCDEF, vbicq_s32(vacc2xCDEF, vzero_shift_mask), 31);
+    vacc3x0123 = vsraq_n_s32(vacc3x0123, vbicq_s32(vacc3x0123, vzero_shift_mask), 31);
+    vacc3x4567 = vsraq_n_s32(vacc3x4567, vbicq_s32(vacc3x4567, vzero_shift_mask), 31);
+    vacc3x89AB = vsraq_n_s32(vacc3x89AB, vbicq_s32(vacc3x89AB, vzero_shift_mask), 31);
+    vacc3xCDEF = vsraq_n_s32(vacc3xCDEF, vbicq_s32(vacc3xCDEF, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc1x89AB = vrshlq_s32(vacc1x89AB, vright_shift);
+    vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_shift);
+    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+    vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_shift);
+    vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_shift);
+    vacc3x0123 = vrshlq_s32(vacc3x0123, vright_shift);
+    vacc3x4567 = vrshlq_s32(vacc3x4567, vright_shift);
+    vacc3x89AB = vrshlq_s32(vacc3x89AB, vright_shift);
+    vacc3xCDEF = vrshlq_s32(vacc3xCDEF, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+    const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), voutput_zero_point);
+    const int16x8_t vacc3x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567), voutput_zero_point);
+    const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), voutput_zero_point);
+    int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+    int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
+    int8x16_t vout2x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc2x89ABCDEF);
+    int8x16_t vout3x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc3x01234567), vacc3x89ABCDEF);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+    const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)), voutput_zero_point);
+    const int16x8_t vacc3x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x0123), vqmovn_s32(vacc3x4567)), voutput_zero_point);
+    const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3xCDEF)), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+    int8x16_t vout1x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc1x01234567), vqmovn_s16(vacc1x89ABCDEF));
+    int8x16_t vout2x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc2x89ABCDEF));
+    int8x16_t vout3x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc3x01234567), vqmovn_s16(vacc3x89ABCDEF));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+    vout1x0123456789ABCDEF = vmaxq_s8(vout1x0123456789ABCDEF, voutput_min);
+    vout2x0123456789ABCDEF = vmaxq_s8(vout2x0123456789ABCDEF, voutput_min);
+    vout3x0123456789ABCDEF = vmaxq_s8(vout3x0123456789ABCDEF, voutput_min);
+
+    vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+    vout1x0123456789ABCDEF = vminq_s8(vout1x0123456789ABCDEF, voutput_max);
+    vout2x0123456789ABCDEF = vminq_s8(vout2x0123456789ABCDEF, voutput_max);
+    vout3x0123456789ABCDEF = vminq_s8(vout3x0123456789ABCDEF, voutput_max);
+
+    if (nc >= 16) {
+      vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+      vst1q_s8(c1 + 0, vout1x0123456789ABCDEF);
+      vst1q_s8(c2 + 0, vout2x0123456789ABCDEF);
+      vst1q_s8(c3 + 0, vout3x0123456789ABCDEF);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+
+      nc -= 16;
+    } else {
+      int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8(vout1x0123456789ABCDEF));
+      int8x16_t vout2x01234567_3x01234567 = vcombine_s8(vget_low_s8(vout2x0123456789ABCDEF), vget_low_s8(vout3x0123456789ABCDEF));
+      if (nc & 8) {
+        vst1_s8(c0, vget_low_s8(vout0x01234567_1x01234567)); c0 += 8;
+        vst1_s8(c1, vget_high_s8(vout0x01234567_1x01234567)); c1 += 8;
+        vst1_s8(c2, vget_low_s8(vout2x01234567_3x01234567)); c2 += 8;
+        vst1_s8(c3, vget_high_s8(vout2x01234567_3x01234567)); c3 += 8;
+        vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0123456789ABCDEF));
+        vout2x01234567_3x01234567 = vcombine_s8(vget_high_s8(vout2x0123456789ABCDEF), vget_high_s8(vout3x0123456789ABCDEF));
+      }
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 0); c2 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c3, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 2); c3 += 4;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 0); c2 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c3, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 4); c3 += 2;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c2, vout2x01234567_3x01234567, 0);
+        vst1q_lane_s8(c3, vout2x01234567_3x01234567, 8);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-gemm/gen/4x16c8-minmax-neon-mull-padal.c b/src/qs8-gemm/gen/4x16c8-minmax-neon-mull-padal.c
new file mode 100644
index 0000000..94cbe06
--- /dev/null
+++ b/src/qs8-gemm/gen/4x16c8-minmax-neon-mull-padal.c
@@ -0,0 +1,598 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/c8-neon-mull-padal.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+  const int8_t* a3 = (const int8_t*) ((uintptr_t) a2 + a_stride);
+  int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    a3 = a2;
+    c3 = c2;
+  }
+
+  do {
+    int32x4_t vacc0x0 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x1 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x2 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x3 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x4 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x5 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x6 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x7 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x8 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x9 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x10 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x11 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x12 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x13 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x14 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x15 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc1x0 = vacc0x0;
+    int32x4_t vacc1x1 = vacc0x1;
+    int32x4_t vacc1x2 = vacc0x2;
+    int32x4_t vacc1x3 = vacc0x3;
+    int32x4_t vacc1x4 = vacc0x4;
+    int32x4_t vacc1x5 = vacc0x5;
+    int32x4_t vacc1x6 = vacc0x6;
+    int32x4_t vacc1x7 = vacc0x7;
+    int32x4_t vacc1x8 = vacc0x8;
+    int32x4_t vacc1x9 = vacc0x9;
+    int32x4_t vacc1x10 = vacc0x10;
+    int32x4_t vacc1x11 = vacc0x11;
+    int32x4_t vacc1x12 = vacc0x12;
+    int32x4_t vacc1x13 = vacc0x13;
+    int32x4_t vacc1x14 = vacc0x14;
+    int32x4_t vacc1x15 = vacc0x15;
+    int32x4_t vacc2x0 = vacc0x0;
+    int32x4_t vacc2x1 = vacc0x1;
+    int32x4_t vacc2x2 = vacc0x2;
+    int32x4_t vacc2x3 = vacc0x3;
+    int32x4_t vacc2x4 = vacc0x4;
+    int32x4_t vacc2x5 = vacc0x5;
+    int32x4_t vacc2x6 = vacc0x6;
+    int32x4_t vacc2x7 = vacc0x7;
+    int32x4_t vacc2x8 = vacc0x8;
+    int32x4_t vacc2x9 = vacc0x9;
+    int32x4_t vacc2x10 = vacc0x10;
+    int32x4_t vacc2x11 = vacc0x11;
+    int32x4_t vacc2x12 = vacc0x12;
+    int32x4_t vacc2x13 = vacc0x13;
+    int32x4_t vacc2x14 = vacc0x14;
+    int32x4_t vacc2x15 = vacc0x15;
+    int32x4_t vacc3x0 = vacc0x0;
+    int32x4_t vacc3x1 = vacc0x1;
+    int32x4_t vacc3x2 = vacc0x2;
+    int32x4_t vacc3x3 = vacc0x3;
+    int32x4_t vacc3x4 = vacc0x4;
+    int32x4_t vacc3x5 = vacc0x5;
+    int32x4_t vacc3x6 = vacc0x6;
+    int32x4_t vacc3x7 = vacc0x7;
+    int32x4_t vacc3x8 = vacc0x8;
+    int32x4_t vacc3x9 = vacc0x9;
+    int32x4_t vacc3x10 = vacc0x10;
+    int32x4_t vacc3x11 = vacc0x11;
+    int32x4_t vacc3x12 = vacc0x12;
+    int32x4_t vacc3x13 = vacc0x13;
+    int32x4_t vacc3x14 = vacc0x14;
+    int32x4_t vacc3x15 = vacc0x15;
+
+    size_t k = kc;
+
+    // Handle 8 bytes at a time using MUL.
+    while (k > 0) {
+      const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+      const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+      const int8x8_t va2 = vld1_s8(a2); a2 += 8;
+      const int8x8_t va3 = vld1_s8(a3); a3 += 8;
+
+      const int8x8_t vb0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x0 = vmull_s8(vb0, va0);
+      const int16x8_t vprod1x0 = vmull_s8(vb0, va1);
+      const int16x8_t vprod2x0 = vmull_s8(vb0, va2);
+      const int16x8_t vprod3x0 = vmull_s8(vb0, va3);
+      vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+      vacc1x0 = vpadalq_s16(vacc1x0, vprod1x0);
+      vacc2x0 = vpadalq_s16(vacc2x0, vprod2x0);
+      vacc3x0 = vpadalq_s16(vacc3x0, vprod3x0);
+      const int8x8_t vb1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x1 = vmull_s8(vb1, va0);
+      const int16x8_t vprod1x1 = vmull_s8(vb1, va1);
+      const int16x8_t vprod2x1 = vmull_s8(vb1, va2);
+      const int16x8_t vprod3x1 = vmull_s8(vb1, va3);
+      vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+      vacc1x1 = vpadalq_s16(vacc1x1, vprod1x1);
+      vacc2x1 = vpadalq_s16(vacc2x1, vprod2x1);
+      vacc3x1 = vpadalq_s16(vacc3x1, vprod3x1);
+      const int8x8_t vb2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x2 = vmull_s8(vb2, va0);
+      const int16x8_t vprod1x2 = vmull_s8(vb2, va1);
+      const int16x8_t vprod2x2 = vmull_s8(vb2, va2);
+      const int16x8_t vprod3x2 = vmull_s8(vb2, va3);
+      vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+      vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);
+      vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
+      vacc3x2 = vpadalq_s16(vacc3x2, vprod3x2);
+      const int8x8_t vb3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x3 = vmull_s8(vb3, va0);
+      const int16x8_t vprod1x3 = vmull_s8(vb3, va1);
+      const int16x8_t vprod2x3 = vmull_s8(vb3, va2);
+      const int16x8_t vprod3x3 = vmull_s8(vb3, va3);
+      vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+      vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);
+      vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
+      vacc3x3 = vpadalq_s16(vacc3x3, vprod3x3);
+      const int8x8_t vb4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x4 = vmull_s8(vb4, va0);
+      const int16x8_t vprod1x4 = vmull_s8(vb4, va1);
+      const int16x8_t vprod2x4 = vmull_s8(vb4, va2);
+      const int16x8_t vprod3x4 = vmull_s8(vb4, va3);
+      vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+      vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);
+      vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);
+      vacc3x4 = vpadalq_s16(vacc3x4, vprod3x4);
+      const int8x8_t vb5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x5 = vmull_s8(vb5, va0);
+      const int16x8_t vprod1x5 = vmull_s8(vb5, va1);
+      const int16x8_t vprod2x5 = vmull_s8(vb5, va2);
+      const int16x8_t vprod3x5 = vmull_s8(vb5, va3);
+      vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+      vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);
+      vacc2x5 = vpadalq_s16(vacc2x5, vprod2x5);
+      vacc3x5 = vpadalq_s16(vacc3x5, vprod3x5);
+      const int8x8_t vb6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x6 = vmull_s8(vb6, va0);
+      const int16x8_t vprod1x6 = vmull_s8(vb6, va1);
+      const int16x8_t vprod2x6 = vmull_s8(vb6, va2);
+      const int16x8_t vprod3x6 = vmull_s8(vb6, va3);
+      vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+      vacc1x6 = vpadalq_s16(vacc1x6, vprod1x6);
+      vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6);
+      vacc3x6 = vpadalq_s16(vacc3x6, vprod3x6);
+      const int8x8_t vb7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x7 = vmull_s8(vb7, va0);
+      const int16x8_t vprod1x7 = vmull_s8(vb7, va1);
+      const int16x8_t vprod2x7 = vmull_s8(vb7, va2);
+      const int16x8_t vprod3x7 = vmull_s8(vb7, va3);
+      vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+      vacc1x7 = vpadalq_s16(vacc1x7, vprod1x7);
+      vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7);
+      vacc3x7 = vpadalq_s16(vacc3x7, vprod3x7);
+      const int8x8_t vb8 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x8 = vmull_s8(vb8, va0);
+      const int16x8_t vprod1x8 = vmull_s8(vb8, va1);
+      const int16x8_t vprod2x8 = vmull_s8(vb8, va2);
+      const int16x8_t vprod3x8 = vmull_s8(vb8, va3);
+      vacc0x8 = vpadalq_s16(vacc0x8, vprod0x8);
+      vacc1x8 = vpadalq_s16(vacc1x8, vprod1x8);
+      vacc2x8 = vpadalq_s16(vacc2x8, vprod2x8);
+      vacc3x8 = vpadalq_s16(vacc3x8, vprod3x8);
+      const int8x8_t vb9 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x9 = vmull_s8(vb9, va0);
+      const int16x8_t vprod1x9 = vmull_s8(vb9, va1);
+      const int16x8_t vprod2x9 = vmull_s8(vb9, va2);
+      const int16x8_t vprod3x9 = vmull_s8(vb9, va3);
+      vacc0x9 = vpadalq_s16(vacc0x9, vprod0x9);
+      vacc1x9 = vpadalq_s16(vacc1x9, vprod1x9);
+      vacc2x9 = vpadalq_s16(vacc2x9, vprod2x9);
+      vacc3x9 = vpadalq_s16(vacc3x9, vprod3x9);
+      const int8x8_t vb10 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x10 = vmull_s8(vb10, va0);
+      const int16x8_t vprod1x10 = vmull_s8(vb10, va1);
+      const int16x8_t vprod2x10 = vmull_s8(vb10, va2);
+      const int16x8_t vprod3x10 = vmull_s8(vb10, va3);
+      vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
+      vacc1x10 = vpadalq_s16(vacc1x10, vprod1x10);
+      vacc2x10 = vpadalq_s16(vacc2x10, vprod2x10);
+      vacc3x10 = vpadalq_s16(vacc3x10, vprod3x10);
+      const int8x8_t vb11 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x11 = vmull_s8(vb11, va0);
+      const int16x8_t vprod1x11 = vmull_s8(vb11, va1);
+      const int16x8_t vprod2x11 = vmull_s8(vb11, va2);
+      const int16x8_t vprod3x11 = vmull_s8(vb11, va3);
+      vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11);
+      vacc1x11 = vpadalq_s16(vacc1x11, vprod1x11);
+      vacc2x11 = vpadalq_s16(vacc2x11, vprod2x11);
+      vacc3x11 = vpadalq_s16(vacc3x11, vprod3x11);
+      const int8x8_t vb12 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x12 = vmull_s8(vb12, va0);
+      const int16x8_t vprod1x12 = vmull_s8(vb12, va1);
+      const int16x8_t vprod2x12 = vmull_s8(vb12, va2);
+      const int16x8_t vprod3x12 = vmull_s8(vb12, va3);
+      vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);
+      vacc1x12 = vpadalq_s16(vacc1x12, vprod1x12);
+      vacc2x12 = vpadalq_s16(vacc2x12, vprod2x12);
+      vacc3x12 = vpadalq_s16(vacc3x12, vprod3x12);
+      const int8x8_t vb13 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x13 = vmull_s8(vb13, va0);
+      const int16x8_t vprod1x13 = vmull_s8(vb13, va1);
+      const int16x8_t vprod2x13 = vmull_s8(vb13, va2);
+      const int16x8_t vprod3x13 = vmull_s8(vb13, va3);
+      vacc0x13 = vpadalq_s16(vacc0x13, vprod0x13);
+      vacc1x13 = vpadalq_s16(vacc1x13, vprod1x13);
+      vacc2x13 = vpadalq_s16(vacc2x13, vprod2x13);
+      vacc3x13 = vpadalq_s16(vacc3x13, vprod3x13);
+      const int8x8_t vb14 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x14 = vmull_s8(vb14, va0);
+      const int16x8_t vprod1x14 = vmull_s8(vb14, va1);
+      const int16x8_t vprod2x14 = vmull_s8(vb14, va2);
+      const int16x8_t vprod3x14 = vmull_s8(vb14, va3);
+      vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
+      vacc1x14 = vpadalq_s16(vacc1x14, vprod1x14);
+      vacc2x14 = vpadalq_s16(vacc2x14, vprod2x14);
+      vacc3x14 = vpadalq_s16(vacc3x14, vprod3x14);
+      const int8x8_t vb15 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x15 = vmull_s8(vb15, va0);
+      const int16x8_t vprod1x15 = vmull_s8(vb15, va1);
+      const int16x8_t vprod2x15 = vmull_s8(vb15, va2);
+      const int16x8_t vprod3x15 = vmull_s8(vb15, va3);
+      vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
+      vacc1x15 = vpadalq_s16(vacc1x15, vprod1x15);
+      vacc2x15 = vpadalq_s16(vacc2x15, vprod2x15);
+      vacc3x15 = vpadalq_s16(vacc3x15, vprod3x15);
+
+      k -= 8 * sizeof(int8_t);
+    }
+
+#if XNN_ARCH_ARM64
+    const int32x4_t vsum0x01 = vpaddq_s32(vacc0x0, vacc0x1);
+    const int32x4_t vsum0x23 = vpaddq_s32(vacc0x2, vacc0x3);
+    const int32x4_t vsum0x45 = vpaddq_s32(vacc0x4, vacc0x5);
+    const int32x4_t vsum0x67 = vpaddq_s32(vacc0x6, vacc0x7);
+    const int32x4_t vsum0x89 = vpaddq_s32(vacc0x8, vacc0x9);
+    const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11);
+    const int32x4_t vsum0xCD = vpaddq_s32(vacc0x12, vacc0x13);
+    const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
+    const int32x4_t vsum1x01 = vpaddq_s32(vacc1x0, vacc1x1);
+    const int32x4_t vsum1x23 = vpaddq_s32(vacc1x2, vacc1x3);
+    const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);
+    const int32x4_t vsum1x67 = vpaddq_s32(vacc1x6, vacc1x7);
+    const int32x4_t vsum1x89 = vpaddq_s32(vacc1x8, vacc1x9);
+    const int32x4_t vsum1xAB = vpaddq_s32(vacc1x10, vacc1x11);
+    const int32x4_t vsum1xCD = vpaddq_s32(vacc1x12, vacc1x13);
+    const int32x4_t vsum1xEF = vpaddq_s32(vacc1x14, vacc1x15);
+    const int32x4_t vsum2x01 = vpaddq_s32(vacc2x0, vacc2x1);
+    const int32x4_t vsum2x23 = vpaddq_s32(vacc2x2, vacc2x3);
+    const int32x4_t vsum2x45 = vpaddq_s32(vacc2x4, vacc2x5);
+    const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7);
+    const int32x4_t vsum2x89 = vpaddq_s32(vacc2x8, vacc2x9);
+    const int32x4_t vsum2xAB = vpaddq_s32(vacc2x10, vacc2x11);
+    const int32x4_t vsum2xCD = vpaddq_s32(vacc2x12, vacc2x13);
+    const int32x4_t vsum2xEF = vpaddq_s32(vacc2x14, vacc2x15);
+    const int32x4_t vsum3x01 = vpaddq_s32(vacc3x0, vacc3x1);
+    const int32x4_t vsum3x23 = vpaddq_s32(vacc3x2, vacc3x3);
+    const int32x4_t vsum3x45 = vpaddq_s32(vacc3x4, vacc3x5);
+    const int32x4_t vsum3x67 = vpaddq_s32(vacc3x6, vacc3x7);
+    const int32x4_t vsum3x89 = vpaddq_s32(vacc3x8, vacc3x9);
+    const int32x4_t vsum3xAB = vpaddq_s32(vacc3x10, vacc3x11);
+    const int32x4_t vsum3xCD = vpaddq_s32(vacc3x12, vacc3x13);
+    const int32x4_t vsum3xEF = vpaddq_s32(vacc3x14, vacc3x15);
+    int32x4_t vacc0x0123 = vpaddq_s32(vsum0x01, vsum0x23);
+    int32x4_t vacc0x4567 = vpaddq_s32(vsum0x45, vsum0x67);
+    int32x4_t vacc0x89AB = vpaddq_s32(vsum0x89, vsum0xAB);
+    int32x4_t vacc0xCDEF = vpaddq_s32(vsum0xCD, vsum0xEF);
+    int32x4_t vacc1x0123 = vpaddq_s32(vsum1x01, vsum1x23);
+    int32x4_t vacc1x4567 = vpaddq_s32(vsum1x45, vsum1x67);
+    int32x4_t vacc1x89AB = vpaddq_s32(vsum1x89, vsum1xAB);
+    int32x4_t vacc1xCDEF = vpaddq_s32(vsum1xCD, vsum1xEF);
+    int32x4_t vacc2x0123 = vpaddq_s32(vsum2x01, vsum2x23);
+    int32x4_t vacc2x4567 = vpaddq_s32(vsum2x45, vsum2x67);
+    int32x4_t vacc2x89AB = vpaddq_s32(vsum2x89, vsum2xAB);
+    int32x4_t vacc2xCDEF = vpaddq_s32(vsum2xCD, vsum2xEF);
+    int32x4_t vacc3x0123 = vpaddq_s32(vsum3x01, vsum3x23);
+    int32x4_t vacc3x4567 = vpaddq_s32(vsum3x45, vsum3x67);
+    int32x4_t vacc3x89AB = vpaddq_s32(vsum3x89, vsum3xAB);
+    int32x4_t vacc3xCDEF = vpaddq_s32(vsum3xCD, vsum3xEF);
+#else
+    const int32x2_t vpsum0x0 = vadd_s32(vget_low_s32(vacc0x0), vget_high_s32(vacc0x0));
+    const int32x2_t vpsum0x1 = vadd_s32(vget_low_s32(vacc0x1), vget_high_s32(vacc0x1));
+    const int32x2_t vpsum0x2 = vadd_s32(vget_low_s32(vacc0x2), vget_high_s32(vacc0x2));
+    const int32x2_t vpsum0x3 = vadd_s32(vget_low_s32(vacc0x3), vget_high_s32(vacc0x3));
+    const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1);
+    const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3);
+    int32x4_t vacc0x0123 = vcombine_s32(vsum0x01, vsum0x23 );
+    const int32x2_t vpsum0x4 = vadd_s32(vget_low_s32(vacc0x4), vget_high_s32(vacc0x4));
+    const int32x2_t vpsum0x5 = vadd_s32(vget_low_s32(vacc0x5), vget_high_s32(vacc0x5));
+    const int32x2_t vpsum0x6 = vadd_s32(vget_low_s32(vacc0x6), vget_high_s32(vacc0x6));
+    const int32x2_t vpsum0x7 = vadd_s32(vget_low_s32(vacc0x7), vget_high_s32(vacc0x7));
+    const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5);
+    const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7);
+    int32x4_t vacc0x4567 = vcombine_s32(vsum0x45, vsum0x67 );
+    const int32x2_t vpsum0x8 = vadd_s32(vget_low_s32(vacc0x8), vget_high_s32(vacc0x8));
+    const int32x2_t vpsum0x9 = vadd_s32(vget_low_s32(vacc0x9), vget_high_s32(vacc0x9));
+    const int32x2_t vpsum0xA = vadd_s32(vget_low_s32(vacc0x10), vget_high_s32(vacc0x10));
+    const int32x2_t vpsum0xB = vadd_s32(vget_low_s32(vacc0x11), vget_high_s32(vacc0x11));
+    const int32x2_t vsum0x89 = vpadd_s32(vpsum0x8, vpsum0x9);
+    const int32x2_t vsum0xAB = vpadd_s32(vpsum0xA, vpsum0xB);
+    int32x4_t vacc0x89AB = vcombine_s32(vsum0x89, vsum0xAB );
+    const int32x2_t vpsum0xC = vadd_s32(vget_low_s32(vacc0x12), vget_high_s32(vacc0x12));
+    const int32x2_t vpsum0xD = vadd_s32(vget_low_s32(vacc0x13), vget_high_s32(vacc0x13));
+    const int32x2_t vpsum0xE = vadd_s32(vget_low_s32(vacc0x14), vget_high_s32(vacc0x14));
+    const int32x2_t vpsum0xF = vadd_s32(vget_low_s32(vacc0x15), vget_high_s32(vacc0x15));
+    const int32x2_t vsum0xCD = vpadd_s32(vpsum0xC, vpsum0xD);
+    const int32x2_t vsum0xEF = vpadd_s32(vpsum0xE, vpsum0xF);
+    int32x4_t vacc0xCDEF = vcombine_s32(vsum0xCD, vsum0xEF );
+    const int32x2_t vpsum1x0 = vadd_s32(vget_low_s32(vacc1x0), vget_high_s32(vacc1x0));
+    const int32x2_t vpsum1x1 = vadd_s32(vget_low_s32(vacc1x1), vget_high_s32(vacc1x1));
+    const int32x2_t vpsum1x2 = vadd_s32(vget_low_s32(vacc1x2), vget_high_s32(vacc1x2));
+    const int32x2_t vpsum1x3 = vadd_s32(vget_low_s32(vacc1x3), vget_high_s32(vacc1x3));
+    const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1);
+    const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3);
+    int32x4_t vacc1x0123 = vcombine_s32(vsum1x01, vsum1x23 );
+    const int32x2_t vpsum1x4 = vadd_s32(vget_low_s32(vacc1x4), vget_high_s32(vacc1x4));
+    const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));
+    const int32x2_t vpsum1x6 = vadd_s32(vget_low_s32(vacc1x6), vget_high_s32(vacc1x6));
+    const int32x2_t vpsum1x7 = vadd_s32(vget_low_s32(vacc1x7), vget_high_s32(vacc1x7));
+    const int32x2_t vsum1x45 = vpadd_s32(vpsum1x4, vpsum1x5);
+    const int32x2_t vsum1x67 = vpadd_s32(vpsum1x6, vpsum1x7);
+    int32x4_t vacc1x4567 = vcombine_s32(vsum1x45, vsum1x67 );
+    const int32x2_t vpsum1x8 = vadd_s32(vget_low_s32(vacc1x8), vget_high_s32(vacc1x8));
+    const int32x2_t vpsum1x9 = vadd_s32(vget_low_s32(vacc1x9), vget_high_s32(vacc1x9));
+    const int32x2_t vpsum1xA = vadd_s32(vget_low_s32(vacc1x10), vget_high_s32(vacc1x10));
+    const int32x2_t vpsum1xB = vadd_s32(vget_low_s32(vacc1x11), vget_high_s32(vacc1x11));
+    const int32x2_t vsum1x89 = vpadd_s32(vpsum1x8, vpsum1x9);
+    const int32x2_t vsum1xAB = vpadd_s32(vpsum1xA, vpsum1xB);
+    int32x4_t vacc1x89AB = vcombine_s32(vsum1x89, vsum1xAB );
+    const int32x2_t vpsum1xC = vadd_s32(vget_low_s32(vacc1x12), vget_high_s32(vacc1x12));
+    const int32x2_t vpsum1xD = vadd_s32(vget_low_s32(vacc1x13), vget_high_s32(vacc1x13));
+    const int32x2_t vpsum1xE = vadd_s32(vget_low_s32(vacc1x14), vget_high_s32(vacc1x14));
+    const int32x2_t vpsum1xF = vadd_s32(vget_low_s32(vacc1x15), vget_high_s32(vacc1x15));
+    const int32x2_t vsum1xCD = vpadd_s32(vpsum1xC, vpsum1xD);
+    const int32x2_t vsum1xEF = vpadd_s32(vpsum1xE, vpsum1xF);
+    int32x4_t vacc1xCDEF = vcombine_s32(vsum1xCD, vsum1xEF );
+    const int32x2_t vpsum2x0 = vadd_s32(vget_low_s32(vacc2x0), vget_high_s32(vacc2x0));
+    const int32x2_t vpsum2x1 = vadd_s32(vget_low_s32(vacc2x1), vget_high_s32(vacc2x1));
+    const int32x2_t vpsum2x2 = vadd_s32(vget_low_s32(vacc2x2), vget_high_s32(vacc2x2));
+    const int32x2_t vpsum2x3 = vadd_s32(vget_low_s32(vacc2x3), vget_high_s32(vacc2x3));
+    const int32x2_t vsum2x01 = vpadd_s32(vpsum2x0, vpsum2x1);
+    const int32x2_t vsum2x23 = vpadd_s32(vpsum2x2, vpsum2x3);
+    int32x4_t vacc2x0123 = vcombine_s32(vsum2x01, vsum2x23 );
+    const int32x2_t vpsum2x4 = vadd_s32(vget_low_s32(vacc2x4), vget_high_s32(vacc2x4));
+    const int32x2_t vpsum2x5 = vadd_s32(vget_low_s32(vacc2x5), vget_high_s32(vacc2x5));
+    const int32x2_t vpsum2x6 = vadd_s32(vget_low_s32(vacc2x6), vget_high_s32(vacc2x6));
+    const int32x2_t vpsum2x7 = vadd_s32(vget_low_s32(vacc2x7), vget_high_s32(vacc2x7));
+    const int32x2_t vsum2x45 = vpadd_s32(vpsum2x4, vpsum2x5);
+    const int32x2_t vsum2x67 = vpadd_s32(vpsum2x6, vpsum2x7);
+    int32x4_t vacc2x4567 = vcombine_s32(vsum2x45, vsum2x67 );
+    const int32x2_t vpsum2x8 = vadd_s32(vget_low_s32(vacc2x8), vget_high_s32(vacc2x8));
+    const int32x2_t vpsum2x9 = vadd_s32(vget_low_s32(vacc2x9), vget_high_s32(vacc2x9));
+    const int32x2_t vpsum2xA = vadd_s32(vget_low_s32(vacc2x10), vget_high_s32(vacc2x10));
+    const int32x2_t vpsum2xB = vadd_s32(vget_low_s32(vacc2x11), vget_high_s32(vacc2x11));
+    const int32x2_t vsum2x89 = vpadd_s32(vpsum2x8, vpsum2x9);
+    const int32x2_t vsum2xAB = vpadd_s32(vpsum2xA, vpsum2xB);
+    int32x4_t vacc2x89AB = vcombine_s32(vsum2x89, vsum2xAB );
+    const int32x2_t vpsum2xC = vadd_s32(vget_low_s32(vacc2x12), vget_high_s32(vacc2x12));
+    const int32x2_t vpsum2xD = vadd_s32(vget_low_s32(vacc2x13), vget_high_s32(vacc2x13));
+    const int32x2_t vpsum2xE = vadd_s32(vget_low_s32(vacc2x14), vget_high_s32(vacc2x14));
+    const int32x2_t vpsum2xF = vadd_s32(vget_low_s32(vacc2x15), vget_high_s32(vacc2x15));
+    const int32x2_t vsum2xCD = vpadd_s32(vpsum2xC, vpsum2xD);
+    const int32x2_t vsum2xEF = vpadd_s32(vpsum2xE, vpsum2xF);
+    int32x4_t vacc2xCDEF = vcombine_s32(vsum2xCD, vsum2xEF );
+    const int32x2_t vpsum3x0 = vadd_s32(vget_low_s32(vacc3x0), vget_high_s32(vacc3x0));
+    const int32x2_t vpsum3x1 = vadd_s32(vget_low_s32(vacc3x1), vget_high_s32(vacc3x1));
+    const int32x2_t vpsum3x2 = vadd_s32(vget_low_s32(vacc3x2), vget_high_s32(vacc3x2));
+    const int32x2_t vpsum3x3 = vadd_s32(vget_low_s32(vacc3x3), vget_high_s32(vacc3x3));
+    const int32x2_t vsum3x01 = vpadd_s32(vpsum3x0, vpsum3x1);
+    const int32x2_t vsum3x23 = vpadd_s32(vpsum3x2, vpsum3x3);
+    int32x4_t vacc3x0123 = vcombine_s32(vsum3x01, vsum3x23 );
+    const int32x2_t vpsum3x4 = vadd_s32(vget_low_s32(vacc3x4), vget_high_s32(vacc3x4));
+    const int32x2_t vpsum3x5 = vadd_s32(vget_low_s32(vacc3x5), vget_high_s32(vacc3x5));
+    const int32x2_t vpsum3x6 = vadd_s32(vget_low_s32(vacc3x6), vget_high_s32(vacc3x6));
+    const int32x2_t vpsum3x7 = vadd_s32(vget_low_s32(vacc3x7), vget_high_s32(vacc3x7));
+    const int32x2_t vsum3x45 = vpadd_s32(vpsum3x4, vpsum3x5);
+    const int32x2_t vsum3x67 = vpadd_s32(vpsum3x6, vpsum3x7);
+    int32x4_t vacc3x4567 = vcombine_s32(vsum3x45, vsum3x67 );
+    const int32x2_t vpsum3x8 = vadd_s32(vget_low_s32(vacc3x8), vget_high_s32(vacc3x8));
+    const int32x2_t vpsum3x9 = vadd_s32(vget_low_s32(vacc3x9), vget_high_s32(vacc3x9));
+    const int32x2_t vpsum3xA = vadd_s32(vget_low_s32(vacc3x10), vget_high_s32(vacc3x10));
+    const int32x2_t vpsum3xB = vadd_s32(vget_low_s32(vacc3x11), vget_high_s32(vacc3x11));
+    const int32x2_t vsum3x89 = vpadd_s32(vpsum3x8, vpsum3x9);
+    const int32x2_t vsum3xAB = vpadd_s32(vpsum3xA, vpsum3xB);
+    int32x4_t vacc3x89AB = vcombine_s32(vsum3x89, vsum3xAB );
+    const int32x2_t vpsum3xC = vadd_s32(vget_low_s32(vacc3x12), vget_high_s32(vacc3x12));
+    const int32x2_t vpsum3xD = vadd_s32(vget_low_s32(vacc3x13), vget_high_s32(vacc3x13));
+    const int32x2_t vpsum3xE = vadd_s32(vget_low_s32(vacc3x14), vget_high_s32(vacc3x14));
+    const int32x2_t vpsum3xF = vadd_s32(vget_low_s32(vacc3x15), vget_high_s32(vacc3x15));
+    const int32x2_t vsum3xCD = vpadd_s32(vpsum3xC, vpsum3xD);
+    const int32x2_t vsum3xEF = vpadd_s32(vpsum3xE, vpsum3xF);
+    int32x4_t vacc3xCDEF = vcombine_s32(vsum3xCD, vsum3xEF );
+#endif
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+    vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc1x89AB = vqrdmulhq_s32(vacc1x89AB, vmultiplier);
+    vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier);
+    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+    vacc2x89AB = vqrdmulhq_s32(vacc2x89AB, vmultiplier);
+    vacc2xCDEF = vqrdmulhq_s32(vacc2xCDEF, vmultiplier);
+    vacc3x0123 = vqrdmulhq_s32(vacc3x0123, vmultiplier);
+    vacc3x4567 = vqrdmulhq_s32(vacc3x4567, vmultiplier);
+    vacc3x89AB = vqrdmulhq_s32(vacc3x89AB, vmultiplier);
+    vacc3xCDEF = vqrdmulhq_s32(vacc3xCDEF, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+    vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc1x89AB = vsraq_n_s32(vacc1x89AB, vbicq_s32(vacc1x89AB, vzero_shift_mask), 31);
+    vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31);
+    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+    vacc2x89AB = vsraq_n_s32(vacc2x89AB, vbicq_s32(vacc2x89AB, vzero_shift_mask), 31);
+    vacc2xCDEF = vsraq_n_s32(vacc2xCDEF, vbicq_s32(vacc2xCDEF, vzero_shift_mask), 31);
+    vacc3x0123 = vsraq_n_s32(vacc3x0123, vbicq_s32(vacc3x0123, vzero_shift_mask), 31);
+    vacc3x4567 = vsraq_n_s32(vacc3x4567, vbicq_s32(vacc3x4567, vzero_shift_mask), 31);
+    vacc3x89AB = vsraq_n_s32(vacc3x89AB, vbicq_s32(vacc3x89AB, vzero_shift_mask), 31);
+    vacc3xCDEF = vsraq_n_s32(vacc3xCDEF, vbicq_s32(vacc3xCDEF, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc1x89AB = vrshlq_s32(vacc1x89AB, vright_shift);
+    vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_shift);
+    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+    vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_shift);
+    vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_shift);
+    vacc3x0123 = vrshlq_s32(vacc3x0123, vright_shift);
+    vacc3x4567 = vrshlq_s32(vacc3x4567, vright_shift);
+    vacc3x89AB = vrshlq_s32(vacc3x89AB, vright_shift);
+    vacc3xCDEF = vrshlq_s32(vacc3xCDEF, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+    const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), voutput_zero_point);
+    const int16x8_t vacc3x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567), voutput_zero_point);
+    const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), voutput_zero_point);
+    int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+    int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
+    int8x16_t vout2x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc2x89ABCDEF);
+    int8x16_t vout3x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc3x01234567), vacc3x89ABCDEF);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+    const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)), voutput_zero_point);
+    const int16x8_t vacc3x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x0123), vqmovn_s32(vacc3x4567)), voutput_zero_point);
+    const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3xCDEF)), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+    int8x16_t vout1x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc1x01234567), vqmovn_s16(vacc1x89ABCDEF));
+    int8x16_t vout2x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc2x89ABCDEF));
+    int8x16_t vout3x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc3x01234567), vqmovn_s16(vacc3x89ABCDEF));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+    vout1x0123456789ABCDEF = vmaxq_s8(vout1x0123456789ABCDEF, voutput_min);
+    vout2x0123456789ABCDEF = vmaxq_s8(vout2x0123456789ABCDEF, voutput_min);
+    vout3x0123456789ABCDEF = vmaxq_s8(vout3x0123456789ABCDEF, voutput_min);
+
+    vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+    vout1x0123456789ABCDEF = vminq_s8(vout1x0123456789ABCDEF, voutput_max);
+    vout2x0123456789ABCDEF = vminq_s8(vout2x0123456789ABCDEF, voutput_max);
+    vout3x0123456789ABCDEF = vminq_s8(vout3x0123456789ABCDEF, voutput_max);
+
+    if (nc >= 16) {
+      vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+      vst1q_s8(c1 + 0, vout1x0123456789ABCDEF);
+      vst1q_s8(c2 + 0, vout2x0123456789ABCDEF);
+      vst1q_s8(c3 + 0, vout3x0123456789ABCDEF);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+
+      nc -= 16;
+    } else {
+      int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8(vout1x0123456789ABCDEF));
+      int8x16_t vout2x01234567_3x01234567 = vcombine_s8(vget_low_s8(vout2x0123456789ABCDEF), vget_low_s8(vout3x0123456789ABCDEF));
+      if (nc & 8) {
+        vst1_s8(c0, vget_low_s8(vout0x01234567_1x01234567)); c0 += 8;
+        vst1_s8(c1, vget_high_s8(vout0x01234567_1x01234567)); c1 += 8;
+        vst1_s8(c2, vget_low_s8(vout2x01234567_3x01234567)); c2 += 8;
+        vst1_s8(c3, vget_high_s8(vout2x01234567_3x01234567)); c3 += 8;
+        vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0123456789ABCDEF));
+        vout2x01234567_3x01234567 = vcombine_s8(vget_high_s8(vout2x0123456789ABCDEF), vget_high_s8(vout3x0123456789ABCDEF));
+      }
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 0); c2 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c3, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 2); c3 += 4;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 0); c2 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c3, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 4); c3 += 2;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c2, vout2x01234567_3x01234567, 0);
+        vst1q_lane_s8(c3, vout2x01234567_3x01234567, 8);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-sse2-ld128.c b/src/qs8-gemm/gen/4x4c2-minmax-sse2-ld128.c
index 88b2a44..fc682f9 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-sse2-ld128.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-sse2-ld128.c
@@ -12,6 +12,7 @@
 #include <emmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_4x4c2__sse2_ld128(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -180,21 +182,6 @@
             _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
           vacc3x0123 = _mm_add_epi32(vacc3x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-          if (k > 6 * sizeof(int8_t)) {
-            const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-            const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
-            w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-            vacc0x0123 = _mm_add_epi32(vacc0x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc1x0123 = _mm_add_epi32(vacc1x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc2x0123 = _mm_add_epi32(vacc2x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc3x0123 = _mm_add_epi32(vacc3x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-          }
         }
       }
     }
@@ -312,16 +299,16 @@
       vout = _mm_srli_si128(vout, 4);
       *((uint32_t*) c3) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
-      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
-      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
       c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-sse2-ld64.c b/src/qs8-gemm/gen/4x4c2-minmax-sse2-ld64.c
index bd28fe4..4e99dd2 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-sse2-ld64.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-sse2-ld64.c
@@ -12,6 +12,7 @@
 #include <emmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_4x4c2__sse2_ld64(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -180,21 +182,6 @@
             _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
           vacc3x0123 = _mm_add_epi32(vacc3x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-          if (k > 6 * sizeof(int8_t)) {
-            const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-            const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
-            w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-            vacc0x0123 = _mm_add_epi32(vacc0x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc1x0123 = _mm_add_epi32(vacc1x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc2x0123 = _mm_add_epi32(vacc2x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc3x0123 = _mm_add_epi32(vacc3x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-          }
         }
       }
     }
@@ -312,16 +299,16 @@
       vout = _mm_srli_si128(vout, 4);
       *((uint32_t*) c3) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
-      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
-      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
       c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-sse41-ld128.c b/src/qs8-gemm/gen/4x4c2-minmax-sse41-ld128.c
index 8395f05..49935f0 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-sse41-ld128.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-sse41-ld128.c
@@ -12,6 +12,7 @@
 #include <smmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -180,21 +182,6 @@
             _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
           vacc3x0123 = _mm_add_epi32(vacc3x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-          if (k > 6 * sizeof(int8_t)) {
-            const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-            const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
-            w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-            vacc0x0123 = _mm_add_epi32(vacc0x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc1x0123 = _mm_add_epi32(vacc1x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc2x0123 = _mm_add_epi32(vacc2x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc3x0123 = _mm_add_epi32(vacc3x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-          }
         }
       }
     }
@@ -269,16 +256,16 @@
       *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
       *((uint32_t*) c3) = (uint32_t) _mm_extract_epi32(vout, 3);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
-      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
-      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
       c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-sse41-ld64.c b/src/qs8-gemm/gen/4x4c2-minmax-sse41-ld64.c
index c06b1f3..2e985a2 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-sse41-ld64.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-sse41-ld64.c
@@ -12,6 +12,7 @@
 #include <smmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -180,21 +182,6 @@
             _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
           vacc3x0123 = _mm_add_epi32(vacc3x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-          if (k > 6 * sizeof(int8_t)) {
-            const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-            const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
-            w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-            vacc0x0123 = _mm_add_epi32(vacc0x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc1x0123 = _mm_add_epi32(vacc1x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc2x0123 = _mm_add_epi32(vacc2x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc3x0123 = _mm_add_epi32(vacc3x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-          }
         }
       }
     }
@@ -269,16 +256,16 @@
       *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
       *((uint32_t*) c3) = (uint32_t) _mm_extract_epi32(vout, 3);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
-      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
-      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
       c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-ssse3-ld128.c b/src/qs8-gemm/gen/4x4c2-minmax-ssse3-ld128.c
index 0d91bcb..81ed872 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-ssse3-ld128.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-ssse3-ld128.c
@@ -12,6 +12,7 @@
 #include <tmmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_4x4c2__ssse3_ld128(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -180,21 +182,6 @@
             _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
           vacc3x0123 = _mm_add_epi32(vacc3x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-          if (k > 6 * sizeof(int8_t)) {
-            const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-            const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
-            w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-            vacc0x0123 = _mm_add_epi32(vacc0x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc1x0123 = _mm_add_epi32(vacc1x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc2x0123 = _mm_add_epi32(vacc2x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc3x0123 = _mm_add_epi32(vacc3x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-          }
         }
       }
     }
@@ -312,16 +299,16 @@
       vout = _mm_srli_si128(vout, 4);
       *((uint32_t*) c3) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
-      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
-      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
       c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-ssse3-ld64.c b/src/qs8-gemm/gen/4x4c2-minmax-ssse3-ld64.c
index e3c1141..053a947 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-ssse3-ld64.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-ssse3-ld64.c
@@ -12,6 +12,7 @@
 #include <tmmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_4x4c2__ssse3_ld64(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -180,21 +182,6 @@
             _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
           vacc3x0123 = _mm_add_epi32(vacc3x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-          if (k > 6 * sizeof(int8_t)) {
-            const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-            const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
-            w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-            vacc0x0123 = _mm_add_epi32(vacc0x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc1x0123 = _mm_add_epi32(vacc1x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc2x0123 = _mm_add_epi32(vacc2x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc3x0123 = _mm_add_epi32(vacc3x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-          }
         }
       }
     }
@@ -312,16 +299,16 @@
       vout = _mm_srli_si128(vout, 4);
       *((uint32_t*) c3) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
-      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
-      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
       c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-xop-ld128.c b/src/qs8-gemm/gen/4x4c2-minmax-xop-ld128.c
index 24f9fde..860d51b 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-xop-ld128.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-xop-ld128.c
@@ -17,6 +17,7 @@
 #endif
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128(
@@ -40,6 +41,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -185,21 +187,6 @@
             _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc2x0123);
           vacc3x0123 = _mm_maddd_epi16(
             _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc3x0123);
-
-          if (k > 6 * sizeof(int8_t)) {
-            const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-            const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
-            w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-            vacc0x0123 = _mm_maddd_epi16(
-              _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc0x0123);
-            vacc1x0123 = _mm_maddd_epi16(
-              _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc1x0123);
-            vacc2x0123 = _mm_maddd_epi16(
-              _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc2x0123);
-            vacc3x0123 = _mm_maddd_epi16(
-              _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc3x0123);
-          }
         }
       }
     }
@@ -274,16 +261,16 @@
       *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
       *((uint32_t*) c3) = (uint32_t) _mm_extract_epi32(vout, 3);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
-      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
-      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
       c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-xop-ld64.c b/src/qs8-gemm/gen/4x4c2-minmax-xop-ld64.c
index a06cd6c..5f89488 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-xop-ld64.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-xop-ld64.c
@@ -17,6 +17,7 @@
 #endif
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64(
@@ -40,6 +41,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -185,21 +187,6 @@
             _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc2x0123);
           vacc3x0123 = _mm_maddd_epi16(
             _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc3x0123);
-
-          if (k > 6 * sizeof(int8_t)) {
-            const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-            const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
-            w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-            vacc0x0123 = _mm_maddd_epi16(
-              _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc0x0123);
-            vacc1x0123 = _mm_maddd_epi16(
-              _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc1x0123);
-            vacc2x0123 = _mm_maddd_epi16(
-              _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc2x0123);
-            vacc3x0123 = _mm_maddd_epi16(
-              _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc3x0123);
-          }
         }
       }
     }
@@ -274,16 +261,16 @@
       *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
       *((uint32_t*) c3) = (uint32_t) _mm_extract_epi32(vout, 3);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
-      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
-      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
       c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/4x4c2-xw-minmax-sse2.c b/src/qs8-gemm/gen/4x4c2-xw-minmax-sse2.c
index 0b90db8..030105f 100644
--- a/src/qs8-gemm/gen/4x4c2-xw-minmax-sse2.c
+++ b/src/qs8-gemm/gen/4x4c2-xw-minmax-sse2.c
@@ -12,6 +12,7 @@
 #include <emmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse2(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -173,20 +175,6 @@
             _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
           vacc3x0123 = _mm_add_epi32(vacc3x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-          if (k > 6 * sizeof(int8_t)) {
-            const __m128i vxb3 = _mm_load_si128((const __m128i*) w);
-            w = (const void*) ((uintptr_t) w + 8 * sizeof(int16_t));
-
-            vacc0x0123 = _mm_add_epi32(vacc0x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc1x0123 = _mm_add_epi32(vacc1x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc2x0123 = _mm_add_epi32(vacc2x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc3x0123 = _mm_add_epi32(vacc3x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-          }
         }
       }
     }
@@ -304,16 +292,16 @@
       vout = _mm_srli_si128(vout, 4);
       *((uint32_t*) c3) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
-      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
-      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
       c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/4x4c2-xw-minmax-sse41.c b/src/qs8-gemm/gen/4x4c2-xw-minmax-sse41.c
index 100c932..d10630b 100644
--- a/src/qs8-gemm/gen/4x4c2-xw-minmax-sse41.c
+++ b/src/qs8-gemm/gen/4x4c2-xw-minmax-sse41.c
@@ -12,6 +12,7 @@
 #include <smmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -173,20 +175,6 @@
             _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
           vacc3x0123 = _mm_add_epi32(vacc3x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-          if (k > 6 * sizeof(int8_t)) {
-            const __m128i vxb3 = _mm_load_si128((const __m128i*) w);
-            w = (const void*) ((uintptr_t) w + 8 * sizeof(int16_t));
-
-            vacc0x0123 = _mm_add_epi32(vacc0x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc1x0123 = _mm_add_epi32(vacc1x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc2x0123 = _mm_add_epi32(vacc2x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc3x0123 = _mm_add_epi32(vacc3x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-          }
         }
       }
     }
@@ -261,16 +249,16 @@
       *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
       *((uint32_t*) c3) = (uint32_t) _mm_extract_epi32(vout, 3);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
-      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
-      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
       c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/4x4c2-xw-minmax-ssse3.c b/src/qs8-gemm/gen/4x4c2-xw-minmax-ssse3.c
index 4ae78f7..637a1ca 100644
--- a/src/qs8-gemm/gen/4x4c2-xw-minmax-ssse3.c
+++ b/src/qs8-gemm/gen/4x4c2-xw-minmax-ssse3.c
@@ -12,6 +12,7 @@
 #include <tmmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__ssse3(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -173,20 +175,6 @@
             _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
           vacc3x0123 = _mm_add_epi32(vacc3x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-          if (k > 6 * sizeof(int8_t)) {
-            const __m128i vxb3 = _mm_load_si128((const __m128i*) w);
-            w = (const void*) ((uintptr_t) w + 8 * sizeof(int16_t));
-
-            vacc0x0123 = _mm_add_epi32(vacc0x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc1x0123 = _mm_add_epi32(vacc1x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc2x0123 = _mm_add_epi32(vacc2x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc3x0123 = _mm_add_epi32(vacc3x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-          }
         }
       }
     }
@@ -304,16 +292,16 @@
       vout = _mm_srli_si128(vout, 4);
       *((uint32_t*) c3) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
-      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
-      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
       c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/4x4c2-xw-minmax-xop.c b/src/qs8-gemm/gen/4x4c2-xw-minmax-xop.c
index b6275df..471b856 100644
--- a/src/qs8-gemm/gen/4x4c2-xw-minmax-xop.c
+++ b/src/qs8-gemm/gen/4x4c2-xw-minmax-xop.c
@@ -17,6 +17,7 @@
 #endif
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop(
@@ -40,6 +41,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -178,20 +180,6 @@
             _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc2x0123);
           vacc3x0123 = _mm_maddd_epi16(
             _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc3x0123);
-
-          if (k > 6 * sizeof(int8_t)) {
-            const __m128i vxb3 = _mm_load_si128((const __m128i*) w);
-            w = (const void*) ((uintptr_t) w + 8 * sizeof(int16_t));
-
-            vacc0x0123 = _mm_maddd_epi16(
-              _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc0x0123);
-            vacc1x0123 = _mm_maddd_epi16(
-              _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc1x0123);
-            vacc2x0123 = _mm_maddd_epi16(
-              _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc2x0123);
-            vacc3x0123 = _mm_maddd_epi16(
-              _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc3x0123);
-          }
         }
       }
     }
@@ -266,16 +254,16 @@
       *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
       *((uint32_t*) c3) = (uint32_t) _mm_extract_epi32(vout, 3);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
-      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
-      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
       c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/4x8-minmax-neon-mlal-lane.c b/src/qs8-gemm/gen/4x8-minmax-neon-mlal-lane.c
new file mode 100644
index 0000000..2592771
--- /dev/null
+++ b/src/qs8-gemm/gen/4x8-minmax-neon-mlal-lane.c
@@ -0,0 +1,381 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/neon-mlal-lane.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/gemm.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+  const int8_t* a3 = (const int8_t*) ((uintptr_t) a2 + a_stride);
+  int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    a3 = a2;
+    c3 = c2;
+  }
+
+  do {
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc1x0123 = vacc0x0123;
+    int32x4_t vacc1x4567 = vacc0x4567;
+    int32x4_t vacc2x0123 = vacc0x0123;
+    int32x4_t vacc2x4567 = vacc0x4567;
+    int32x4_t vacc3x0123 = vacc0x0123;
+    int32x4_t vacc3x4567 = vacc0x4567;
+
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
+      const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+      const int16x8_t vxa0 = vmovl_s8(va0);
+      const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+      const int16x8_t vxa1 = vmovl_s8(va1);
+      const int8x8_t va2 = vld1_s8(a2); a2 += 8;
+      const int16x8_t vxa2 = vmovl_s8(va2);
+      const int8x8_t va3 = vld1_s8(a3); a3 += 8;
+      const int16x8_t vxa3 = vmovl_s8(va3);
+
+      const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vxb01234567c0 = vmovl_s8(vb01234567c0);
+
+      vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+      vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+      vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
+      vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
+      vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
+      vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
+      vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0);
+      vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0);
+      const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vxb01234567c1 = vmovl_s8(vb01234567c1);
+
+      vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+      vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+      vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
+      vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
+      vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa2), 1);
+      vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa2), 1);
+      vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1);
+      vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1);
+      const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vxb01234567c2 = vmovl_s8(vb01234567c2);
+
+      vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+      vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+      vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
+      vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
+      vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa2), 2);
+      vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa2), 2);
+      vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2);
+      vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa3), 2);
+      const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vxb01234567c3 = vmovl_s8(vb01234567c3);
+
+      vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+      vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+      vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
+      vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
+      vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa2), 3);
+      vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa2), 3);
+      vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa3), 3);
+      vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa3), 3);
+
+      const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vxb01234567c4 = vmovl_s8(vb01234567c4);
+
+      vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+      vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+      vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
+      vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
+      vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa2), 0);
+      vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa2), 0);
+      vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa3), 0);
+      vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa3), 0);
+      const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vxb01234567c5 = vmovl_s8(vb01234567c5);
+
+      vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+      vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+      vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
+      vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
+      vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa2), 1);
+      vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa2), 1);
+      vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa3), 1);
+      vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa3), 1);
+      const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vxb01234567c6 = vmovl_s8(vb01234567c6);
+
+      vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+      vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+      vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
+      vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
+      vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa2), 2);
+      vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa2), 2);
+      vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa3), 2);
+      vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa3), 2);
+      const int8x8_t vb01234567c7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vxb01234567c7 = vmovl_s8(vb01234567c7);
+
+      vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa0), 3);
+      vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa0), 3);
+      vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa1), 3);
+      vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa1), 3);
+      vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa2), 3);
+      vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa2), 3);
+      vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa3), 3);
+      vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa3), 3);
+
+      k -= 8 * sizeof(int8_t);
+    }
+    if XNN_UNLIKELY(k != 0) {
+      const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+      const int16x8_t vxa0 = vmovl_s8(va0);
+      const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
+      const int16x8_t vxa1 = vmovl_s8(va1);
+      const int8x8_t va2 = vld1_s8(a2); a2 = (const int8_t*) ((uintptr_t) a2 + k);
+      const int16x8_t vxa2 = vmovl_s8(va2);
+      const int8x8_t va3 = vld1_s8(a3); a3 = (const int8_t*) ((uintptr_t) a3 + k);
+      const int16x8_t vxa3 = vmovl_s8(va3);
+
+      const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vxb01234567c0 = vmovl_s8(vb01234567c0);
+
+      vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+      vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+      vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
+      vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
+      vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
+      vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
+      vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0);
+      vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0);
+
+      if (k >= 2 * sizeof(int8_t)) {
+        const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb01234567c1 = vmovl_s8(vb01234567c1);
+
+        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+        vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
+        vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
+        vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa2), 1);
+        vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa2), 1);
+        vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1);
+        vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1);
+
+        if (k > 2 * sizeof(int8_t)) {
+          const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int16x8_t vxb01234567c2 = vmovl_s8(vb01234567c2);
+
+          vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+          vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+          vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
+          vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
+          vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa2), 2);
+          vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa2), 2);
+          vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2);
+          vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa3), 2);
+
+          if (k >= 4 * sizeof(int8_t)) {
+            const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int16x8_t vxb01234567c3 = vmovl_s8(vb01234567c3);
+
+            vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+            vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+            vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
+            vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
+            vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa2), 3);
+            vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa2), 3);
+            vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa3), 3);
+            vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa3), 3);
+
+            if (k > 4 * sizeof(int8_t)) {
+              const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+              const int16x8_t vxb01234567c4 = vmovl_s8(vb01234567c4);
+
+              vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+              vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+              vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
+              vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
+              vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa2), 0);
+              vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa2), 0);
+              vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa3), 0);
+              vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa3), 0);
+
+              if (k >= 6 * sizeof(int8_t)) {
+                const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+                const int16x8_t vxb01234567c5 = vmovl_s8(vb01234567c5);
+
+                vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+                vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+                vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
+                vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
+                vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa2), 1);
+                vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa2), 1);
+                vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa3), 1);
+                vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa3), 1);
+
+                if (k > 6 * sizeof(int8_t)) {
+                  const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+                  const int16x8_t vxb01234567c6 = vmovl_s8(vb01234567c6);
+
+                  vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+                  vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+                  vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
+                  vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
+                  vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa2), 2);
+                  vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa2), 2);
+                  vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa3), 2);
+                  vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa3), 2);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+    vacc3x0123 = vqrdmulhq_s32(vacc3x0123, vmultiplier);
+    vacc3x4567 = vqrdmulhq_s32(vacc3x4567, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+    vacc3x0123 = vsraq_n_s32(vacc3x0123, vbicq_s32(vacc3x0123, vzero_shift_mask), 31);
+    vacc3x4567 = vsraq_n_s32(vacc3x4567, vbicq_s32(vacc3x4567, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+    vacc3x0123 = vrshlq_s32(vacc3x0123, vright_shift);
+    vacc3x4567 = vrshlq_s32(vacc3x4567, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+    const int16x8_t vacc3x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
+    int8x16_t vout2x01234567_3x01234567 = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc3x01234567);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+    const int16x8_t vacc3x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x0123), vqmovn_s32(vacc3x4567)), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
+    int8x16_t vout2x01234567_3x01234567 = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc3x01234567));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min);
+    vout2x01234567_3x01234567 = vmaxq_s8(vout2x01234567_3x01234567, voutput_min);
+
+    vout0x01234567_1x01234567 = vminq_s8(vout0x01234567_1x01234567, voutput_max);
+    vout2x01234567_3x01234567 = vminq_s8(vout2x01234567_3x01234567, voutput_max);
+
+    if (nc >= 8) {
+      vst1_s8(c0 + 0, vget_low_s8(vout0x01234567_1x01234567));
+      vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567));
+      vst1_s8(c2 + 0, vget_low_s8(vout2x01234567_3x01234567));
+      vst1_s8(c3 + 0, vget_high_s8(vout2x01234567_3x01234567));
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 0); c2 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c3, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 2); c3 += 4;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 0); c2 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c3, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 4); c3 += 2;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c2, vout2x01234567_3x01234567, 0);
+        vst1q_lane_s8(c3, vout2x01234567_3x01234567, 8);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-gemm/gen/4x8-minmax-neon-mull-addw-dup.c b/src/qs8-gemm/gen/4x8-minmax-neon-mull-addw-dup.c
new file mode 100644
index 0000000..23b5f6f
--- /dev/null
+++ b/src/qs8-gemm/gen/4x8-minmax-neon-mull-addw-dup.c
@@ -0,0 +1,416 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/neon-mull-addw-dup.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/gemm.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_4x8__neon_mull_addw_dup(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+  const int8_t* a3 = (const int8_t*) ((uintptr_t) a2 + a_stride);
+  int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    a3 = a2;
+    c3 = c2;
+  }
+
+  do {
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc1x0123 = vacc0x0123;
+    int32x4_t vacc1x4567 = vacc0x4567;
+    int32x4_t vacc2x0123 = vacc0x0123;
+    int32x4_t vacc2x4567 = vacc0x4567;
+    int32x4_t vacc3x0123 = vacc0x0123;
+    int32x4_t vacc3x4567 = vacc0x4567;
+
+    size_t k = kc;
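+    // Main loop: for each of the 8 K values, broadcast one int8 activation per row (vdup_lane_s8),
+    // widen-multiply it against 8 weights (vmull_s8), and widen-accumulate the int16 products into
+    // the int32 accumulators (vaddw_s16): the 'mull-addw-dup' scheme named in the file.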
+    while (k >= 8 * sizeof(int8_t)) {
+      const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+      const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+      const int8x8_t va2 = vld1_s8(a2); a2 += 8;
+      const int8x8_t va3 = vld1_s8(a3); a3 += 8;
+
+      const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va0, 0));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c0));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c0));
+      const int16x8_t vprod1x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va1, 0));
+      vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c0));
+      vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c0));
+      const int16x8_t vprod2x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va2, 0));
+      vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c0));
+      vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c0));
+      const int16x8_t vprod3x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va3, 0));
+      vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c0));
+      vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c0));
+      const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va0, 1));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c1));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c1));
+      const int16x8_t vprod1x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va1, 1));
+      vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c1));
+      vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c1));
+      const int16x8_t vprod2x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va2, 1));
+      vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c1));
+      vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c1));
+      const int16x8_t vprod3x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va3, 1));
+      vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c1));
+      vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c1));
+      const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va0, 2));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c2));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c2));
+      const int16x8_t vprod1x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va1, 2));
+      vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c2));
+      vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c2));
+      const int16x8_t vprod2x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va2, 2));
+      vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c2));
+      vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c2));
+      const int16x8_t vprod3x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va3, 2));
+      vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c2));
+      vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c2));
+      const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));
+      const int16x8_t vprod1x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va1, 3));
+      vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c3));
+      vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c3));
+      const int16x8_t vprod2x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va2, 3));
+      vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c3));
+      vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c3));
+      const int16x8_t vprod3x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va3, 3));
+      vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c3));
+      vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c3));
+      const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va0, 4));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c4));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c4));
+      const int16x8_t vprod1x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va1, 4));
+      vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c4));
+      vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c4));
+      const int16x8_t vprod2x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va2, 4));
+      vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c4));
+      vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c4));
+      const int16x8_t vprod3x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va3, 4));
+      vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c4));
+      vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c4));
+      const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va0, 5));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c5));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c5));
+      const int16x8_t vprod1x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va1, 5));
+      vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c5));
+      vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c5));
+      const int16x8_t vprod2x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va2, 5));
+      vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c5));
+      vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c5));
+      const int16x8_t vprod3x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va3, 5));
+      vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c5));
+      vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c5));
+      const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va0, 6));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c6));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c6));
+      const int16x8_t vprod1x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va1, 6));
+      vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c6));
+      vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c6));
+      const int16x8_t vprod2x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va2, 6));
+      vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c6));
+      vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c6));
+      const int16x8_t vprod3x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va3, 6));
+      vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c6));
+      vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c6));
+      const int8x8_t vb01234567c7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c7 = vmull_s8(vb01234567c7, vdup_lane_s8(va0, 7));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c7));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c7));
+      const int16x8_t vprod1x01234567c7 = vmull_s8(vb01234567c7, vdup_lane_s8(va1, 7));
+      vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c7));
+      vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c7));
+      const int16x8_t vprod2x01234567c7 = vmull_s8(vb01234567c7, vdup_lane_s8(va2, 7));
+      vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c7));
+      vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c7));
+      const int16x8_t vprod3x01234567c7 = vmull_s8(vb01234567c7, vdup_lane_s8(va3, 7));
+      vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c7));
+      vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c7));
+
+      k -= 8 * sizeof(int8_t);
+    }
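+    // Remainder: up to 7 leftover K values. Only the lanes for the remaining k inputs contribute,
+    // selected by the nested conditionals below.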
+    if XNN_UNLIKELY(k != 0) {
+      const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+      const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
+      const int8x8_t va2 = vld1_s8(a2); a2 = (const int8_t*) ((uintptr_t) a2 + k);
+      const int8x8_t va3 = vld1_s8(a3); a3 = (const int8_t*) ((uintptr_t) a3 + k);
+
+      const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va0, 0));
+      vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c0));
+      vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c0));
+      const int16x8_t vprod1x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va1, 0));
+      vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c0));
+      vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c0));
+      const int16x8_t vprod2x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va2, 0));
+      vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c0));
+      vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c0));
+      const int16x8_t vprod3x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va3, 0));
+      vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c0));
+      vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c0));
+
+      if (k >= 2 * sizeof(int8_t)) {
+        const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va0, 1));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c1));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c1));
+        const int16x8_t vprod1x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va1, 1));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c1));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c1));
+        const int16x8_t vprod2x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va2, 1));
+        vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c1));
+        vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c1));
+        const int16x8_t vprod3x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va3, 1));
+        vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c1));
+        vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c1));
+
+        if (k > 2 * sizeof(int8_t)) {
+          const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          const int16x8_t vprod0x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va0, 2));
+          vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c2));
+          vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c2));
+          const int16x8_t vprod1x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va1, 2));
+          vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c2));
+          vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c2));
+          const int16x8_t vprod2x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va2, 2));
+          vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c2));
+          vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c2));
+          const int16x8_t vprod3x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va3, 2));
+          vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c2));
+          vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c2));
+
+          if (k >= 4 * sizeof(int8_t)) {
+            const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+            const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));
+            vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
+            vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));
+            const int16x8_t vprod1x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va1, 3));
+            vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c3));
+            vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c3));
+            const int16x8_t vprod2x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va2, 3));
+            vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c3));
+            vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c3));
+            const int16x8_t vprod3x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va3, 3));
+            vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c3));
+            vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c3));
+
+            if (k > 4 * sizeof(int8_t)) {
+              const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+              const int16x8_t vprod0x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va0, 4));
+              vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c4));
+              vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c4));
+              const int16x8_t vprod1x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va1, 4));
+              vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c4));
+              vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c4));
+              const int16x8_t vprod2x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va2, 4));
+              vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c4));
+              vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c4));
+              const int16x8_t vprod3x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va3, 4));
+              vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c4));
+              vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c4));
+
+              if (k >= 6 * sizeof(int8_t)) {
+                const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+                const int16x8_t vprod0x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va0, 5));
+                vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c5));
+                vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c5));
+                const int16x8_t vprod1x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va1, 5));
+                vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c5));
+                vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c5));
+                const int16x8_t vprod2x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va2, 5));
+                vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c5));
+                vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c5));
+                const int16x8_t vprod3x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va3, 5));
+                vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c5));
+                vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c5));
+
+                if (k > 6 * sizeof(int8_t)) {
+                  const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+                  const int16x8_t vprod0x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va0, 6));
+                  vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c6));
+                  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c6));
+                  const int16x8_t vprod1x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va1, 6));
+                  vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c6));
+                  vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c6));
+                  const int16x8_t vprod2x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va2, 6));
+                  vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c6));
+                  vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c6));
+                  const int16x8_t vprod3x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va3, 6));
+                  vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c6));
+                  vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c6));
+                }
+              }
+            }
+          }
+        }
+      }
+    }
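+    // Requantize the int32 accumulators: saturating fixed-point multiply with rounding (vqrdmulh),
+    // a fix-up so that the rounding right shift (vrshlq) rounds ties away from zero, saturating
+    // narrowing to int16 with the output zero point added, further narrowing to int8, and clamping
+    // to [output_min, output_max].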
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+    vacc3x0123 = vqrdmulhq_s32(vacc3x0123, vmultiplier);
+    vacc3x4567 = vqrdmulhq_s32(vacc3x4567, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+    vacc3x0123 = vsraq_n_s32(vacc3x0123, vbicq_s32(vacc3x0123, vzero_shift_mask), 31);
+    vacc3x4567 = vsraq_n_s32(vacc3x4567, vbicq_s32(vacc3x4567, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+    vacc3x0123 = vrshlq_s32(vacc3x0123, vright_shift);
+    vacc3x4567 = vrshlq_s32(vacc3x4567, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+    const int16x8_t vacc3x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
+    int8x16_t vout2x01234567_3x01234567 = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc3x01234567);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+    const int16x8_t vacc3x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x0123), vqmovn_s32(vacc3x4567)), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
+    int8x16_t vout2x01234567_3x01234567 = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc3x01234567));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min);
+    vout2x01234567_3x01234567 = vmaxq_s8(vout2x01234567_3x01234567, voutput_min);
+
+    vout0x01234567_1x01234567 = vminq_s8(vout0x01234567_1x01234567, voutput_max);
+    vout2x01234567_3x01234567 = vminq_s8(vout2x01234567_3x01234567, voutput_max);
+
+    if (nc >= 8) {
+      vst1_s8(c0 + 0, vget_low_s8(vout0x01234567_1x01234567));
+      vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567));
+      vst1_s8(c2 + 0, vget_low_s8(vout2x01234567_3x01234567));
+      vst1_s8(c3 + 0, vget_high_s8(vout2x01234567_3x01234567));
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+
+      nc -= 8;
+    } else {
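+      // Partial-tile store: write the remaining 1-7 output columns per row with progressively
+      // smaller lane stores (4, 2, then 1 byte). __builtin_assume_aligned(c, 1) marks the
+      // destinations as byte-aligned only, and vextq_s8 rotates already-stored bytes out of the
+      // vector between steps.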
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 0); c2 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c3, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 2); c3 += 4;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 0); c2 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c3, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 4); c3 += 2;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c2, vout2x01234567_3x01234567, 0);
+        vst1q_lane_s8(c3, vout2x01234567_3x01234567, 8);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-gemm/gen/4x8c16-minmax-neon-mlal-padal.c b/src/qs8-gemm/gen/4x8c16-minmax-neon-mlal-padal.c
new file mode 100644
index 0000000..45b6a5e
--- /dev/null
+++ b/src/qs8-gemm/gen/4x8c16-minmax-neon-mlal-padal.c
@@ -0,0 +1,396 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/c16-neon-mlal-padal.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 16);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+  const int8_t* a3 = (const int8_t*) ((uintptr_t) a2 + a_stride);
+  int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    a3 = a2;
+    c3 = c2;
+  }
+
+  do {
+    int32x4_t vacc0x0 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x1 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x2 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x3 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x4 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x5 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x6 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x7 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc1x0 = vacc0x0;
+    int32x4_t vacc1x1 = vacc0x1;
+    int32x4_t vacc1x2 = vacc0x2;
+    int32x4_t vacc1x3 = vacc0x3;
+    int32x4_t vacc1x4 = vacc0x4;
+    int32x4_t vacc1x5 = vacc0x5;
+    int32x4_t vacc1x6 = vacc0x6;
+    int32x4_t vacc1x7 = vacc0x7;
+    int32x4_t vacc2x0 = vacc0x0;
+    int32x4_t vacc2x1 = vacc0x1;
+    int32x4_t vacc2x2 = vacc0x2;
+    int32x4_t vacc2x3 = vacc0x3;
+    int32x4_t vacc2x4 = vacc0x4;
+    int32x4_t vacc2x5 = vacc0x5;
+    int32x4_t vacc2x6 = vacc0x6;
+    int32x4_t vacc2x7 = vacc0x7;
+    int32x4_t vacc3x0 = vacc0x0;
+    int32x4_t vacc3x1 = vacc0x1;
+    int32x4_t vacc3x2 = vacc0x2;
+    int32x4_t vacc3x3 = vacc0x3;
+    int32x4_t vacc3x4 = vacc0x4;
+    int32x4_t vacc3x5 = vacc0x5;
+    int32x4_t vacc3x6 = vacc0x6;
+    int32x4_t vacc3x7 = vacc0x7;
+
+    // KC loop: kc is rounded up to a multiple of 16 above, so each iteration consumes exactly 16 bytes of K with no remainder.
+    size_t k = 0;
+    while (k < kc) {
+      const int8x16_t va0 = vld1q_s8(a0); a0 += 16;
+      const int8x16_t va1 = vld1q_s8(a1); a1 += 16;
+      const int8x16_t va2 = vld1q_s8(a2); a2 += 16;
+      const int8x16_t va3 = vld1q_s8(a3); a3 += 16;
+
+      const int8x16_t vb0 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb1 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb2 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb3 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb4 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb5 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb6 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+      const int8x16_t vb7 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+
+      int16x8_t vprod0x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va0));
+      int16x8_t vprod1x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va1));
+      int16x8_t vprod2x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va2));
+      int16x8_t vprod3x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va3));
+      vprod0x0 = vmlal_s8(vprod0x0, vget_high_s8(vb0), vget_high_s8(va0));
+      vprod1x0 = vmlal_s8(vprod1x0, vget_high_s8(vb0), vget_high_s8(va1));
+      vprod2x0 = vmlal_s8(vprod2x0, vget_high_s8(vb0), vget_high_s8(va2));
+      vprod3x0 = vmlal_s8(vprod3x0, vget_high_s8(vb0), vget_high_s8(va3));
+      vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+      vacc1x0 = vpadalq_s16(vacc1x0, vprod1x0);
+      vacc2x0 = vpadalq_s16(vacc2x0, vprod2x0);
+      vacc3x0 = vpadalq_s16(vacc3x0, vprod3x0);
+      int16x8_t vprod0x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va0));
+      int16x8_t vprod1x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va1));
+      int16x8_t vprod2x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va2));
+      int16x8_t vprod3x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va3));
+      vprod0x1 = vmlal_s8(vprod0x1, vget_high_s8(vb1), vget_high_s8(va0));
+      vprod1x1 = vmlal_s8(vprod1x1, vget_high_s8(vb1), vget_high_s8(va1));
+      vprod2x1 = vmlal_s8(vprod2x1, vget_high_s8(vb1), vget_high_s8(va2));
+      vprod3x1 = vmlal_s8(vprod3x1, vget_high_s8(vb1), vget_high_s8(va3));
+      vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+      vacc1x1 = vpadalq_s16(vacc1x1, vprod1x1);
+      vacc2x1 = vpadalq_s16(vacc2x1, vprod2x1);
+      vacc3x1 = vpadalq_s16(vacc3x1, vprod3x1);
+      int16x8_t vprod0x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va0));
+      int16x8_t vprod1x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va1));
+      int16x8_t vprod2x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va2));
+      int16x8_t vprod3x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va3));
+      vprod0x2 = vmlal_s8(vprod0x2, vget_high_s8(vb2), vget_high_s8(va0));
+      vprod1x2 = vmlal_s8(vprod1x2, vget_high_s8(vb2), vget_high_s8(va1));
+      vprod2x2 = vmlal_s8(vprod2x2, vget_high_s8(vb2), vget_high_s8(va2));
+      vprod3x2 = vmlal_s8(vprod3x2, vget_high_s8(vb2), vget_high_s8(va3));
+      vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+      vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);
+      vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
+      vacc3x2 = vpadalq_s16(vacc3x2, vprod3x2);
+      int16x8_t vprod0x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va0));
+      int16x8_t vprod1x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va1));
+      int16x8_t vprod2x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va2));
+      int16x8_t vprod3x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va3));
+      vprod0x3 = vmlal_s8(vprod0x3, vget_high_s8(vb3), vget_high_s8(va0));
+      vprod1x3 = vmlal_s8(vprod1x3, vget_high_s8(vb3), vget_high_s8(va1));
+      vprod2x3 = vmlal_s8(vprod2x3, vget_high_s8(vb3), vget_high_s8(va2));
+      vprod3x3 = vmlal_s8(vprod3x3, vget_high_s8(vb3), vget_high_s8(va3));
+      vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+      vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);
+      vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
+      vacc3x3 = vpadalq_s16(vacc3x3, vprod3x3);
+      int16x8_t vprod0x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va0));
+      int16x8_t vprod1x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va1));
+      int16x8_t vprod2x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va2));
+      int16x8_t vprod3x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va3));
+      vprod0x4 = vmlal_s8(vprod0x4, vget_high_s8(vb4), vget_high_s8(va0));
+      vprod1x4 = vmlal_s8(vprod1x4, vget_high_s8(vb4), vget_high_s8(va1));
+      vprod2x4 = vmlal_s8(vprod2x4, vget_high_s8(vb4), vget_high_s8(va2));
+      vprod3x4 = vmlal_s8(vprod3x4, vget_high_s8(vb4), vget_high_s8(va3));
+      vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+      vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);
+      vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);
+      vacc3x4 = vpadalq_s16(vacc3x4, vprod3x4);
+      int16x8_t vprod0x5 = vmull_s8(vget_low_s8(vb5), vget_low_s8(va0));
+      int16x8_t vprod1x5 = vmull_s8(vget_low_s8(vb5), vget_low_s8(va1));
+      int16x8_t vprod2x5 = vmull_s8(vget_low_s8(vb5), vget_low_s8(va2));
+      int16x8_t vprod3x5 = vmull_s8(vget_low_s8(vb5), vget_low_s8(va3));
+      vprod0x5 = vmlal_s8(vprod0x5, vget_high_s8(vb5), vget_high_s8(va0));
+      vprod1x5 = vmlal_s8(vprod1x5, vget_high_s8(vb5), vget_high_s8(va1));
+      vprod2x5 = vmlal_s8(vprod2x5, vget_high_s8(vb5), vget_high_s8(va2));
+      vprod3x5 = vmlal_s8(vprod3x5, vget_high_s8(vb5), vget_high_s8(va3));
+      vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+      vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);
+      vacc2x5 = vpadalq_s16(vacc2x5, vprod2x5);
+      vacc3x5 = vpadalq_s16(vacc3x5, vprod3x5);
+      int16x8_t vprod0x6 = vmull_s8(vget_low_s8(vb6), vget_low_s8(va0));
+      int16x8_t vprod1x6 = vmull_s8(vget_low_s8(vb6), vget_low_s8(va1));
+      int16x8_t vprod2x6 = vmull_s8(vget_low_s8(vb6), vget_low_s8(va2));
+      int16x8_t vprod3x6 = vmull_s8(vget_low_s8(vb6), vget_low_s8(va3));
+      vprod0x6 = vmlal_s8(vprod0x6, vget_high_s8(vb6), vget_high_s8(va0));
+      vprod1x6 = vmlal_s8(vprod1x6, vget_high_s8(vb6), vget_high_s8(va1));
+      vprod2x6 = vmlal_s8(vprod2x6, vget_high_s8(vb6), vget_high_s8(va2));
+      vprod3x6 = vmlal_s8(vprod3x6, vget_high_s8(vb6), vget_high_s8(va3));
+      vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+      vacc1x6 = vpadalq_s16(vacc1x6, vprod1x6);
+      vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6);
+      vacc3x6 = vpadalq_s16(vacc3x6, vprod3x6);
+      int16x8_t vprod0x7 = vmull_s8(vget_low_s8(vb7), vget_low_s8(va0));
+      int16x8_t vprod1x7 = vmull_s8(vget_low_s8(vb7), vget_low_s8(va1));
+      int16x8_t vprod2x7 = vmull_s8(vget_low_s8(vb7), vget_low_s8(va2));
+      int16x8_t vprod3x7 = vmull_s8(vget_low_s8(vb7), vget_low_s8(va3));
+      vprod0x7 = vmlal_s8(vprod0x7, vget_high_s8(vb7), vget_high_s8(va0));
+      vprod1x7 = vmlal_s8(vprod1x7, vget_high_s8(vb7), vget_high_s8(va1));
+      vprod2x7 = vmlal_s8(vprod2x7, vget_high_s8(vb7), vget_high_s8(va2));
+      vprod3x7 = vmlal_s8(vprod3x7, vget_high_s8(vb7), vget_high_s8(va3));
+      vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+      vacc1x7 = vpadalq_s16(vacc1x7, vprod1x7);
+      vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7);
+      vacc3x7 = vpadalq_s16(vacc3x7, vprod3x7);
+
+      k += 16 * sizeof(int8_t);
+    }
+
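+    // Each vaccRxN vector holds four int32 partial sums for output column N of row R. Reduce the
+    // 8 per-column vectors of each row into two int32x4 vectors (columns 0-3 and 4-7): AArch64
+    // uses vpaddq_s32 directly, while AArch32 emulates it with vadd_s32/vpadd_s32 on the halves.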
+#if XNN_ARCH_ARM64
+    const int32x4_t vsum0x01 = vpaddq_s32(vacc0x0, vacc0x1);
+    const int32x4_t vsum0x23 = vpaddq_s32(vacc0x2, vacc0x3);
+    const int32x4_t vsum0x45 = vpaddq_s32(vacc0x4, vacc0x5);
+    const int32x4_t vsum0x67 = vpaddq_s32(vacc0x6, vacc0x7);
+    const int32x4_t vsum1x01 = vpaddq_s32(vacc1x0, vacc1x1);
+    const int32x4_t vsum1x23 = vpaddq_s32(vacc1x2, vacc1x3);
+    const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);
+    const int32x4_t vsum1x67 = vpaddq_s32(vacc1x6, vacc1x7);
+    const int32x4_t vsum2x01 = vpaddq_s32(vacc2x0, vacc2x1);
+    const int32x4_t vsum2x23 = vpaddq_s32(vacc2x2, vacc2x3);
+    const int32x4_t vsum2x45 = vpaddq_s32(vacc2x4, vacc2x5);
+    const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7);
+    const int32x4_t vsum3x01 = vpaddq_s32(vacc3x0, vacc3x1);
+    const int32x4_t vsum3x23 = vpaddq_s32(vacc3x2, vacc3x3);
+    const int32x4_t vsum3x45 = vpaddq_s32(vacc3x4, vacc3x5);
+    const int32x4_t vsum3x67 = vpaddq_s32(vacc3x6, vacc3x7);
+    int32x4_t vacc0x0123 = vpaddq_s32(vsum0x01, vsum0x23);
+    int32x4_t vacc0x4567 = vpaddq_s32(vsum0x45, vsum0x67);
+    int32x4_t vacc1x0123 = vpaddq_s32(vsum1x01, vsum1x23);
+    int32x4_t vacc1x4567 = vpaddq_s32(vsum1x45, vsum1x67);
+    int32x4_t vacc2x0123 = vpaddq_s32(vsum2x01, vsum2x23);
+    int32x4_t vacc2x4567 = vpaddq_s32(vsum2x45, vsum2x67);
+    int32x4_t vacc3x0123 = vpaddq_s32(vsum3x01, vsum3x23);
+    int32x4_t vacc3x4567 = vpaddq_s32(vsum3x45, vsum3x67);
+#else
+    const int32x2_t vpsum0x0 = vadd_s32(vget_low_s32(vacc0x0), vget_high_s32(vacc0x0));
+    const int32x2_t vpsum0x1 = vadd_s32(vget_low_s32(vacc0x1), vget_high_s32(vacc0x1));
+    const int32x2_t vpsum0x2 = vadd_s32(vget_low_s32(vacc0x2), vget_high_s32(vacc0x2));
+    const int32x2_t vpsum0x3 = vadd_s32(vget_low_s32(vacc0x3), vget_high_s32(vacc0x3));
+    const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1);
+    const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3);
+    int32x4_t vacc0x0123 = vcombine_s32(vsum0x01, vsum0x23);
+    const int32x2_t vpsum0x4 = vadd_s32(vget_low_s32(vacc0x4), vget_high_s32(vacc0x4));
+    const int32x2_t vpsum0x5 = vadd_s32(vget_low_s32(vacc0x5), vget_high_s32(vacc0x5));
+    const int32x2_t vpsum0x6 = vadd_s32(vget_low_s32(vacc0x6), vget_high_s32(vacc0x6));
+    const int32x2_t vpsum0x7 = vadd_s32(vget_low_s32(vacc0x7), vget_high_s32(vacc0x7));
+    const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5);
+    const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7);
+    int32x4_t vacc0x4567 = vcombine_s32(vsum0x45, vsum0x67);
+    const int32x2_t vpsum1x0 = vadd_s32(vget_low_s32(vacc1x0), vget_high_s32(vacc1x0));
+    const int32x2_t vpsum1x1 = vadd_s32(vget_low_s32(vacc1x1), vget_high_s32(vacc1x1));
+    const int32x2_t vpsum1x2 = vadd_s32(vget_low_s32(vacc1x2), vget_high_s32(vacc1x2));
+    const int32x2_t vpsum1x3 = vadd_s32(vget_low_s32(vacc1x3), vget_high_s32(vacc1x3));
+    const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1);
+    const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3);
+    int32x4_t vacc1x0123 = vcombine_s32(vsum1x01, vsum1x23);
+    const int32x2_t vpsum1x4 = vadd_s32(vget_low_s32(vacc1x4), vget_high_s32(vacc1x4));
+    const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));
+    const int32x2_t vpsum1x6 = vadd_s32(vget_low_s32(vacc1x6), vget_high_s32(vacc1x6));
+    const int32x2_t vpsum1x7 = vadd_s32(vget_low_s32(vacc1x7), vget_high_s32(vacc1x7));
+    const int32x2_t vsum1x45 = vpadd_s32(vpsum1x4, vpsum1x5);
+    const int32x2_t vsum1x67 = vpadd_s32(vpsum1x6, vpsum1x7);
+    int32x4_t vacc1x4567 = vcombine_s32(vsum1x45, vsum1x67);
+    const int32x2_t vpsum2x0 = vadd_s32(vget_low_s32(vacc2x0), vget_high_s32(vacc2x0));
+    const int32x2_t vpsum2x1 = vadd_s32(vget_low_s32(vacc2x1), vget_high_s32(vacc2x1));
+    const int32x2_t vpsum2x2 = vadd_s32(vget_low_s32(vacc2x2), vget_high_s32(vacc2x2));
+    const int32x2_t vpsum2x3 = vadd_s32(vget_low_s32(vacc2x3), vget_high_s32(vacc2x3));
+    const int32x2_t vsum2x01 = vpadd_s32(vpsum2x0, vpsum2x1);
+    const int32x2_t vsum2x23 = vpadd_s32(vpsum2x2, vpsum2x3);
+    int32x4_t vacc2x0123 = vcombine_s32(vsum2x01, vsum2x23);
+    const int32x2_t vpsum2x4 = vadd_s32(vget_low_s32(vacc2x4), vget_high_s32(vacc2x4));
+    const int32x2_t vpsum2x5 = vadd_s32(vget_low_s32(vacc2x5), vget_high_s32(vacc2x5));
+    const int32x2_t vpsum2x6 = vadd_s32(vget_low_s32(vacc2x6), vget_high_s32(vacc2x6));
+    const int32x2_t vpsum2x7 = vadd_s32(vget_low_s32(vacc2x7), vget_high_s32(vacc2x7));
+    const int32x2_t vsum2x45 = vpadd_s32(vpsum2x4, vpsum2x5);
+    const int32x2_t vsum2x67 = vpadd_s32(vpsum2x6, vpsum2x7);
+    int32x4_t vacc2x4567 = vcombine_s32(vsum2x45, vsum2x67);
+    const int32x2_t vpsum3x0 = vadd_s32(vget_low_s32(vacc3x0), vget_high_s32(vacc3x0));
+    const int32x2_t vpsum3x1 = vadd_s32(vget_low_s32(vacc3x1), vget_high_s32(vacc3x1));
+    const int32x2_t vpsum3x2 = vadd_s32(vget_low_s32(vacc3x2), vget_high_s32(vacc3x2));
+    const int32x2_t vpsum3x3 = vadd_s32(vget_low_s32(vacc3x3), vget_high_s32(vacc3x3));
+    const int32x2_t vsum3x01 = vpadd_s32(vpsum3x0, vpsum3x1);
+    const int32x2_t vsum3x23 = vpadd_s32(vpsum3x2, vpsum3x3);
+    int32x4_t vacc3x0123 = vcombine_s32(vsum3x01, vsum3x23);
+    const int32x2_t vpsum3x4 = vadd_s32(vget_low_s32(vacc3x4), vget_high_s32(vacc3x4));
+    const int32x2_t vpsum3x5 = vadd_s32(vget_low_s32(vacc3x5), vget_high_s32(vacc3x5));
+    const int32x2_t vpsum3x6 = vadd_s32(vget_low_s32(vacc3x6), vget_high_s32(vacc3x6));
+    const int32x2_t vpsum3x7 = vadd_s32(vget_low_s32(vacc3x7), vget_high_s32(vacc3x7));
+    const int32x2_t vsum3x45 = vpadd_s32(vpsum3x4, vpsum3x5);
+    const int32x2_t vsum3x67 = vpadd_s32(vpsum3x6, vpsum3x7);
+    int32x4_t vacc3x4567 = vcombine_s32(vsum3x45, vsum3x67);
+#endif
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+    vacc3x0123 = vqrdmulhq_s32(vacc3x0123, vmultiplier);
+    vacc3x4567 = vqrdmulhq_s32(vacc3x4567, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+    vacc3x0123 = vsraq_n_s32(vacc3x0123, vbicq_s32(vacc3x0123, vzero_shift_mask), 31);
+    vacc3x4567 = vsraq_n_s32(vacc3x4567, vbicq_s32(vacc3x4567, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+    vacc3x0123 = vrshlq_s32(vacc3x0123, vright_shift);
+    vacc3x4567 = vrshlq_s32(vacc3x4567, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+    const int16x8_t vacc3x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567), voutput_zero_point);
+    int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
+    int8x16_t vout2x01234567_3x01234567 = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc3x01234567);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+    const int16x8_t vacc3x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x0123), vqmovn_s32(vacc3x4567)), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
+    int8x16_t vout2x01234567_3x01234567 = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc3x01234567));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min);
+    vout2x01234567_3x01234567 = vmaxq_s8(vout2x01234567_3x01234567, voutput_min);
+
+    vout0x01234567_1x01234567 = vminq_s8(vout0x01234567_1x01234567, voutput_max);
+    vout2x01234567_3x01234567 = vminq_s8(vout2x01234567_3x01234567, voutput_max);
+
+    if (nc >= 8) {
+      vst1_s8(c0 + 0, vget_low_s8(vout0x01234567_1x01234567));
+      vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567));
+      vst1_s8(c2 + 0, vget_low_s8(vout2x01234567_3x01234567));
+      vst1_s8(c3 + 0, vget_high_s8(vout2x01234567_3x01234567));
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 0); c2 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c3, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 2); c3 += 4;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 0); c2 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c3, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 4); c3 += 2;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c2, vout2x01234567_3x01234567, 0);
+        vst1q_lane_s8(c3, vout2x01234567_3x01234567, 8);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-gemm/gen/4x8c2-minmax-neon-mlal-padal-dup.c b/src/qs8-gemm/gen/4x8c2-minmax-neon-mlal-padal-dup.c
new file mode 100644
index 0000000..ddb72c8
--- /dev/null
+++ b/src/qs8-gemm/gen/4x8c2-minmax-neon-mlal-padal-dup.c
@@ -0,0 +1,454 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/c2-neon-mull-padal-dup.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+  const int8_t* a3 = (const int8_t*) ((uintptr_t) a2 + a_stride);
+  int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    a3 = a2;
+    c3 = c2;
+  }
+
+  do {
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc1x0123 = vacc0x0123;
+    int32x4_t vacc1x4567 = vacc0x4567;
+    int32x4_t vacc2x0123 = vacc0x0123;
+    int32x4_t vacc2x4567 = vacc0x4567;
+    int32x4_t vacc3x0123 = vacc0x0123;
+    int32x4_t vacc3x4567 = vacc0x4567;
+
+    size_t k = kc;
+
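+    // Main loop: consume 16 bytes of K per iteration as two 8-byte halves. Each vb vector holds
+    // 4 output columns for 2 consecutive K values (the 'c2' layout); vdup_lane_s16 over the
+    // reinterpreted activations broadcasts the matching pair of int8 inputs, and the vmull_s8 /
+    // vmlal_s8 products are pairwise-accumulated into int32 lanes with vpadalq_s16.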
+    while (k >= 16 * sizeof(int8_t)) {
+      const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;
+      const int8x8_t va0x1 = vld1_s8(a0); a0 += 8;
+      const int8x8_t va1x0 = vld1_s8(a1); a1 += 8;
+      const int8x8_t va1x1 = vld1_s8(a1); a1 += 8;
+      const int8x8_t va2x0 = vld1_s8(a2); a2 += 8;
+      const int8x8_t va2x1 = vld1_s8(a2); a2 += 8;
+      const int8x8_t va3x0 = vld1_s8(a3); a3 += 8;
+      const int8x8_t va3x1 = vld1_s8(a3); a3 += 8;
+
+      const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+      int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0)));
+      int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
+      int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 0)));
+      const int8x8_t vb0123c0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x0123c0 = vmlal_s8(vprod0x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+      vprod1x0123c0 = vmlal_s8(vprod1x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
+      vprod2x0123c0 = vmlal_s8(vprod2x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 0)));
+      vprod3x0123c0 = vmlal_s8(vprod3x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 0)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+      vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+      vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c0);
+      int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+      int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0)));
+      int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
+      int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 0)));
+      const int8x8_t vb4567c0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x4567c0 = vmlal_s8(vprod0x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+      vprod1x4567c0 = vmlal_s8(vprod1x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
+      vprod2x4567c0 = vmlal_s8(vprod2x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 0)));
+      vprod3x4567c0 = vmlal_s8(vprod3x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 0)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+      vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+      vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c0);
+      int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+      int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 1)));
+      int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
+      int16x8_t vprod3x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 1)));
+      const int8x8_t vb0123c1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x0123c1 = vmlal_s8(vprod0x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+      vprod1x0123c1 = vmlal_s8(vprod1x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
+      vprod2x0123c1 = vmlal_s8(vprod2x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 1)));
+      vprod3x0123c1 = vmlal_s8(vprod3x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 1)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+      vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+      vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c1);
+      int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+      int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 1)));
+      int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
+      int16x8_t vprod3x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 1)));
+      const int8x8_t vb4567c1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x4567c1 = vmlal_s8(vprod0x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+      vprod1x4567c1 = vmlal_s8(vprod1x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
+      vprod2x4567c1 = vmlal_s8(vprod2x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 1)));
+      vprod3x4567c1 = vmlal_s8(vprod3x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 1)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+      vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+      vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c1);
+      int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+      int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 2)));
+      int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 2)));
+      int16x8_t vprod3x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 2)));
+      const int8x8_t vb0123c2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x0123c2 = vmlal_s8(vprod0x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+      vprod1x0123c2 = vmlal_s8(vprod1x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
+      vprod2x0123c2 = vmlal_s8(vprod2x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 2)));
+      vprod3x0123c2 = vmlal_s8(vprod3x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 2)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+      vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+      vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c2);
+      int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+      int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 2)));
+      int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 2)));
+      int16x8_t vprod3x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 2)));
+      const int8x8_t vb4567c2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x4567c2 = vmlal_s8(vprod0x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+      vprod1x4567c2 = vmlal_s8(vprod1x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
+      vprod2x4567c2 = vmlal_s8(vprod2x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 2)));
+      vprod3x4567c2 = vmlal_s8(vprod3x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 2)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+      vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+      vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c2);
+      int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+      int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 3)));
+      int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 3)));
+      int16x8_t vprod3x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 3)));
+      const int8x8_t vb0123c3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x0123c3 = vmlal_s8(vprod0x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+      vprod1x0123c3 = vmlal_s8(vprod1x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
+      vprod2x0123c3 = vmlal_s8(vprod2x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 3)));
+      vprod3x0123c3 = vmlal_s8(vprod3x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 3)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+      vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
+      vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c3);
+      int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+      int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 3)));
+      int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 3)));
+      int16x8_t vprod3x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 3)));
+      const int8x8_t vb4567c3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      vprod0x4567c3 = vmlal_s8(vprod0x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+      vprod1x4567c3 = vmlal_s8(vprod1x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
+      vprod2x4567c3 = vmlal_s8(vprod2x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 3)));
+      vprod3x4567c3 = vmlal_s8(vprod3x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 3)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+      vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
+      vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c3);
+
+      k -= 16 * sizeof(int8_t);
+    }
+
+    if (k >= 8 * sizeof(int8_t)) {
+      const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+      const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+      const int8x8_t va2 = vld1_s8(a2); a2 += 8;
+      const int8x8_t va3 = vld1_s8(a3); a3 += 8;
+
+      const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+      const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+      const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+      const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+      const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+      const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+      const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+      const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+      const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+      const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+      const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+      const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+      const int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+      const int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+      const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+      const int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+      vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+      vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+      vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+      vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
+      const int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+      const int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+      const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+      const int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+      vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+      vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+      vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+      vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
+      const int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+      const int16x8_t vprod3x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+      const int16x8_t vprod3x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+      const int16x8_t vprod3x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
+      vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c0);
+      vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c1);
+      vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c2);
+      vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c3);
+      const int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+      const int16x8_t vprod3x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+      const int16x8_t vprod3x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+      const int16x8_t vprod3x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
+      vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c0);
+      vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c1);
+      vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c2);
+      vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c3);
+
+      k -= 8 * sizeof(int8_t);
+    }
+
+    if XNN_UNLIKELY(k != 0) {
+      const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+      const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
+      const int8x8_t va2 = vld1_s8(a2); a2 = (const int8_t*) ((uintptr_t) a2 + k);
+      const int8x8_t va3 = vld1_s8(a3); a3 = (const int8_t*) ((uintptr_t) a3 + k);
+
+      const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+      const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+      const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+      const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+      const int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+      vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+      const int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+      vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+      const int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+      vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c0);
+      const int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+      vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c0);
+
+      if (k > 2 * sizeof(int8_t)) {
+        const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+        const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+        const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+        const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+        const int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+        const int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+        const int16x8_t vprod3x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+        vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c1);
+        const int16x8_t vprod3x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+        vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c1);
+
+        if (k > 4 * sizeof(int8_t)) {
+          const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+          vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+          const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+          vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+          const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+          vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+          const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+          vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+          const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+          vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+          const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+          vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+          const int16x8_t vprod3x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+          vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c2);
+          const int16x8_t vprod3x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+          vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c2);
+        }
+      }
+    }
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+    vacc3x0123 = vqrdmulhq_s32(vacc3x0123, vmultiplier);
+    vacc3x4567 = vqrdmulhq_s32(vacc3x4567, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+    vacc3x0123 = vsraq_n_s32(vacc3x0123, vbicq_s32(vacc3x0123, vzero_shift_mask), 31);
+    vacc3x4567 = vsraq_n_s32(vacc3x4567, vbicq_s32(vacc3x4567, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+    vacc3x0123 = vrshlq_s32(vacc3x0123, vright_shift);
+    vacc3x4567 = vrshlq_s32(vacc3x4567, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+    const int16x8_t vacc3x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
+    int8x16_t vout2x01234567_3x01234567 = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc3x01234567);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+    const int16x8_t vacc3x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x0123), vqmovn_s32(vacc3x4567)), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
+    int8x16_t vout2x01234567_3x01234567 = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc3x01234567));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min);
+    vout2x01234567_3x01234567 = vmaxq_s8(vout2x01234567_3x01234567, voutput_min);
+
+    vout0x01234567_1x01234567 = vminq_s8(vout0x01234567_1x01234567, voutput_max);
+    vout2x01234567_3x01234567 = vminq_s8(vout2x01234567_3x01234567, voutput_max);
+
+    if (nc >= 8) {
+      vst1_s8(c0 + 0, vget_low_s8(vout0x01234567_1x01234567));
+      vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567));
+      vst1_s8(c2 + 0, vget_low_s8(vout2x01234567_3x01234567));
+      vst1_s8(c3 + 0, vget_high_s8(vout2x01234567_3x01234567));
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 0); c2 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c3, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 2); c3 += 4;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 0); c2 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c3, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 4); c3 += 2;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c2, vout2x01234567_3x01234567, 0);
+        vst1q_lane_s8(c3, vout2x01234567_3x01234567, 8);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-gemm/gen/4x8c2-minmax-neon-mull-padal-dup.c b/src/qs8-gemm/gen/4x8c2-minmax-neon-mull-padal-dup.c
new file mode 100644
index 0000000..a7f10cd
--- /dev/null
+++ b/src/qs8-gemm/gen/4x8c2-minmax-neon-mull-padal-dup.c
@@ -0,0 +1,328 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/c2-neon-mull-padal-dup.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mull_padal_dup(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+  const int8_t* a3 = (const int8_t*) ((uintptr_t) a2 + a_stride);
+  int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    a3 = a2;
+    c3 = c2;
+  }
+
+  do {
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc1x0123 = vacc0x0123;
+    int32x4_t vacc1x4567 = vacc0x4567;
+    int32x4_t vacc2x0123 = vacc0x0123;
+    int32x4_t vacc2x4567 = vacc0x4567;
+    int32x4_t vacc3x0123 = vacc0x0123;
+    int32x4_t vacc3x4567 = vacc0x4567;
+
+    size_t k = kc;
+
+
+    while (k >= 8 * sizeof(int8_t)) {
+      const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+      const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+      const int8x8_t va2 = vld1_s8(a2); a2 += 8;
+      const int8x8_t va3 = vld1_s8(a3); a3 += 8;
+
+      const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+      const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+      const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+      const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+      const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+      const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+      const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+      const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+      const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+      const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+      const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+      const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+      const int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+      const int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+      const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+      const int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+      vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+      vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+      vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+      vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
+      const int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+      const int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+      const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+      const int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+      vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+      vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+      vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+      vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
+      const int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+      const int16x8_t vprod3x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+      const int16x8_t vprod3x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+      const int16x8_t vprod3x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
+      vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c0);
+      vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c1);
+      vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c2);
+      vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c3);
+      const int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+      const int16x8_t vprod3x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+      const int16x8_t vprod3x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+      const int16x8_t vprod3x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
+      vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c0);
+      vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c1);
+      vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c2);
+      vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c3);
+
+      k -= 8 * sizeof(int8_t);
+    }
+
+    if XNN_UNLIKELY(k != 0) {
+      const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+      const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
+      const int8x8_t va2 = vld1_s8(a2); a2 = (const int8_t*) ((uintptr_t) a2 + k);
+      const int8x8_t va3 = vld1_s8(a3); a3 = (const int8_t*) ((uintptr_t) a3 + k);
+
+      const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+      const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+      const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+      const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+      const int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+      vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+      const int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+      vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+      const int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+      vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c0);
+      const int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+      vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c0);
+
+      if (k > 2 * sizeof(int8_t)) {
+        const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+        const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+        const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+        const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+        const int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+        const int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+        const int16x8_t vprod3x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+        vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c1);
+        const int16x8_t vprod3x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+        vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c1);
+
+        if (k > 4 * sizeof(int8_t)) {
+          const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+          vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+          const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+          vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+          const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+          vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+          const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+          vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+          const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+          vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+          const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+          vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+          const int16x8_t vprod3x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+          vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c2);
+          const int16x8_t vprod3x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+          vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c2);
+        }
+      }
+    }
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+    vacc3x0123 = vqrdmulhq_s32(vacc3x0123, vmultiplier);
+    vacc3x4567 = vqrdmulhq_s32(vacc3x4567, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+    vacc3x0123 = vsraq_n_s32(vacc3x0123, vbicq_s32(vacc3x0123, vzero_shift_mask), 31);
+    vacc3x4567 = vsraq_n_s32(vacc3x4567, vbicq_s32(vacc3x4567, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+    vacc3x0123 = vrshlq_s32(vacc3x0123, vright_shift);
+    vacc3x4567 = vrshlq_s32(vacc3x4567, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+    const int16x8_t vacc3x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
+    int8x16_t vout2x01234567_3x01234567 = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc3x01234567);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+    const int16x8_t vacc3x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x0123), vqmovn_s32(vacc3x4567)), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
+    int8x16_t vout2x01234567_3x01234567 = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc3x01234567));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min);
+    vout2x01234567_3x01234567 = vmaxq_s8(vout2x01234567_3x01234567, voutput_min);
+
+    vout0x01234567_1x01234567 = vminq_s8(vout0x01234567_1x01234567, voutput_max);
+    vout2x01234567_3x01234567 = vminq_s8(vout2x01234567_3x01234567, voutput_max);
+
+    if (nc >= 8) {
+      vst1_s8(c0 + 0, vget_low_s8(vout0x01234567_1x01234567));
+      vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567));
+      vst1_s8(c2 + 0, vget_low_s8(vout2x01234567_3x01234567));
+      vst1_s8(c3 + 0, vget_high_s8(vout2x01234567_3x01234567));
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 0); c2 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c3, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 2); c3 += 4;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 0); c2 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c3, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 4); c3 += 2;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c2, vout2x01234567_3x01234567, 0);
+        vst1q_lane_s8(c3, vout2x01234567_3x01234567, 8);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-gemm/gen/4x8c4-minmax-neondot.c b/src/qs8-gemm/gen/4x8c4-minmax-neondot.c
index c1199f5..f733070 100644
--- a/src/qs8-gemm/gen/4x8c4-minmax-neondot.c
+++ b/src/qs8-gemm/gen/4x8c4-minmax-neondot.c
@@ -7,18 +7,14 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
-
 #include <assert.h>
 
 #include <arm_neon.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
-// This kernel uses ARMv8.2 dot-product instructions.
-//
-// Scalar model: xnn_qs8_gemm_minmax_ukernel_4x8c4__scalar. Refer to
-// that kernel for more comments.
 void xnn_qs8_gemm_minmax_ukernel_4x8c4__neondot(
     size_t mr,
     size_t nc,
@@ -34,7 +30,12 @@
   assert(mr <= 4);
   assert(nc != 0);
   assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
 
+  kc = round_up_po2(kc, 4);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -80,10 +81,10 @@
       const int8x8_t va3x01234567 = vld1_s8(a3); a3 += 8;
 
       // Load a 8x8 block of weights.
-      const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
+      const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+      const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+      const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+      const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
 
       // Multiply-accumulate: 4x8 * 8x8 --> 4x8.
       vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb0123x0123, va0x01234567, 0);
@@ -105,17 +106,17 @@
 
       k -= 8 * sizeof(int8_t);
     }
-    // Handle up to 7 final positions of `k`
+    // Handle up to 4 final positions of `k`
     if XNN_UNLIKELY(k != 0) {
       // Load a 4x4 block of activations.
-      const int8x8_t va0x01234567 = vld1_s8(a0); a0 += k;
-      const int8x8_t va1x01234567 = vld1_s8(a1); a1 += k;
-      const int8x8_t va2x01234567 = vld1_s8(a2); a2 += k;
-      const int8x8_t va3x01234567 = vld1_s8(a3); a3 += k;
+      const int8x8_t va0x01234567 = vld1_s8(a0); a0 += 4;
+      const int8x8_t va1x01234567 = vld1_s8(a1); a1 += 4;
+      const int8x8_t va2x01234567 = vld1_s8(a2); a2 += 4;
+      const int8x8_t va3x01234567 = vld1_s8(a3); a3 += 4;
 
       // Load a 4x8 block of weights.
-      const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
+      const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+      const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
 
       // Multiply-accumulate: 4x4 * 4x8 --> 4x8.
       vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb0123x0123, va0x01234567, 0);
@@ -126,32 +127,9 @@
       vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb0123x4567, va2x01234567, 0);
       vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb0123x0123, va3x01234567, 0);
       vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb0123x4567, va3x01234567, 0);
-
-      if (k > 4) {
-        // Load a 4x8 block of weights.
-        const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-
-        // Multiply-accumulate: 4x4 * 4x8 --> 4x8.
-        vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
-        vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
-        vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb4567x0123, va1x01234567, 1);
-        vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb4567x4567, va1x01234567, 1);
-        vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb4567x0123, va2x01234567, 1);
-        vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb4567x4567, va2x01234567, 1);
-        vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb4567x0123, va3x01234567, 1);
-        vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb4567x4567, va3x01234567, 1);
-      }
     }
-    // End of accumulation loop. The variable `kc` contains the amount by which
-    // we advanced the `va` pointers, so we rewind by this amount now.
-    a0 = (const int8_t*)((uintptr_t)a0 - kc);
-    a1 = (const int8_t*)((uintptr_t)a1 - kc);
-    a2 = (const int8_t*)((uintptr_t)a2 - kc);
-    a3 = (const int8_t*)((uintptr_t)a3 - kc);
 
     // Post-accumulation work
-
     const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
     const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
 
@@ -222,6 +200,11 @@
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
       c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+
       nc -= 8;
     } else {
       // Final case where not all of the 8 columns fit in the destination.
diff --git a/src/qs8-gemm/gen/4x8c8-minmax-neon-mlal-padal.c b/src/qs8-gemm/gen/4x8c8-minmax-neon-mlal-padal.c
new file mode 100644
index 0000000..ac9501d
--- /dev/null
+++ b/src/qs8-gemm/gen/4x8c8-minmax-neon-mlal-padal.c
@@ -0,0 +1,491 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/c8-neon-mull-padal.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+  const int8_t* a3 = (const int8_t*) ((uintptr_t) a2 + a_stride);
+  int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    a3 = a2;
+    c3 = c2;
+  }
+
+  do {
+    int32x4_t vacc0x0 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x1 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x2 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x3 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x4 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x5 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x6 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x7 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc1x0 = vacc0x0;
+    int32x4_t vacc1x1 = vacc0x1;
+    int32x4_t vacc1x2 = vacc0x2;
+    int32x4_t vacc1x3 = vacc0x3;
+    int32x4_t vacc1x4 = vacc0x4;
+    int32x4_t vacc1x5 = vacc0x5;
+    int32x4_t vacc1x6 = vacc0x6;
+    int32x4_t vacc1x7 = vacc0x7;
+    int32x4_t vacc2x0 = vacc0x0;
+    int32x4_t vacc2x1 = vacc0x1;
+    int32x4_t vacc2x2 = vacc0x2;
+    int32x4_t vacc2x3 = vacc0x3;
+    int32x4_t vacc2x4 = vacc0x4;
+    int32x4_t vacc2x5 = vacc0x5;
+    int32x4_t vacc2x6 = vacc0x6;
+    int32x4_t vacc2x7 = vacc0x7;
+    int32x4_t vacc3x0 = vacc0x0;
+    int32x4_t vacc3x1 = vacc0x1;
+    int32x4_t vacc3x2 = vacc0x2;
+    int32x4_t vacc3x3 = vacc0x3;
+    int32x4_t vacc3x4 = vacc0x4;
+    int32x4_t vacc3x5 = vacc0x5;
+    int32x4_t vacc3x6 = vacc0x6;
+    int32x4_t vacc3x7 = vacc0x7;
+
+    size_t k = kc;
+    // 2x partially unrolled loop to load 16 bytes at a time using MLA.
+    while (k >= 16 * sizeof(int8_t)) {
+      const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;
+      const int8x8_t va0x1 = vld1_s8(a0); a0 += 8;
+      const int8x8_t va1x0 = vld1_s8(a1); a1 += 8;
+      const int8x8_t va1x1 = vld1_s8(a1); a1 += 8;
+      const int8x8_t va2x0 = vld1_s8(a2); a2 += 8;
+      const int8x8_t va2x1 = vld1_s8(a2); a2 += 8;
+      const int8x8_t va3x0 = vld1_s8(a3); a3 += 8;
+      const int8x8_t va3x1 = vld1_s8(a3); a3 += 8;
+
+      const int8x8_t vb0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb4x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb5x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb6x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int8x8_t vb7x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      const int8x8_t vb0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      int16x8_t vprod0x0 = vmull_s8(vb0x0, va0x0);
+      int16x8_t vprod1x0 = vmull_s8(vb0x0, va1x0);
+      int16x8_t vprod2x0 = vmull_s8(vb0x0, va2x0);
+      int16x8_t vprod3x0 = vmull_s8(vb0x0, va3x0);
+      vprod0x0 = vmlal_s8(vprod0x0, vb0x1, va0x1);
+      vprod1x0 = vmlal_s8(vprod1x0, vb0x1, va1x1);
+      vprod2x0 = vmlal_s8(vprod2x0, vb0x1, va2x1);
+      vprod3x0 = vmlal_s8(vprod3x0, vb0x1, va3x1);
+      vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+      vacc1x0 = vpadalq_s16(vacc1x0, vprod1x0);
+      vacc2x0 = vpadalq_s16(vacc2x0, vprod2x0);
+      vacc3x0 = vpadalq_s16(vacc3x0, vprod3x0);
+      const int8x8_t vb1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      int16x8_t vprod0x1 = vmull_s8(vb1x0, va0x0);
+      int16x8_t vprod1x1 = vmull_s8(vb1x0, va1x0);
+      int16x8_t vprod2x1 = vmull_s8(vb1x0, va2x0);
+      int16x8_t vprod3x1 = vmull_s8(vb1x0, va3x0);
+      vprod0x1 = vmlal_s8(vprod0x1, vb1x1, va0x1);
+      vprod1x1 = vmlal_s8(vprod1x1, vb1x1, va1x1);
+      vprod2x1 = vmlal_s8(vprod2x1, vb1x1, va2x1);
+      vprod3x1 = vmlal_s8(vprod3x1, vb1x1, va3x1);
+      vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+      vacc1x1 = vpadalq_s16(vacc1x1, vprod1x1);
+      vacc2x1 = vpadalq_s16(vacc2x1, vprod2x1);
+      vacc3x1 = vpadalq_s16(vacc3x1, vprod3x1);
+      const int8x8_t vb2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      int16x8_t vprod0x2 = vmull_s8(vb2x0, va0x0);
+      int16x8_t vprod1x2 = vmull_s8(vb2x0, va1x0);
+      int16x8_t vprod2x2 = vmull_s8(vb2x0, va2x0);
+      int16x8_t vprod3x2 = vmull_s8(vb2x0, va3x0);
+      vprod0x2 = vmlal_s8(vprod0x2, vb2x1, va0x1);
+      vprod1x2 = vmlal_s8(vprod1x2, vb2x1, va1x1);
+      vprod2x2 = vmlal_s8(vprod2x2, vb2x1, va2x1);
+      vprod3x2 = vmlal_s8(vprod3x2, vb2x1, va3x1);
+      vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+      vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);
+      vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
+      vacc3x2 = vpadalq_s16(vacc3x2, vprod3x2);
+      const int8x8_t vb3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      int16x8_t vprod0x3 = vmull_s8(vb3x0, va0x0);
+      int16x8_t vprod1x3 = vmull_s8(vb3x0, va1x0);
+      int16x8_t vprod2x3 = vmull_s8(vb3x0, va2x0);
+      int16x8_t vprod3x3 = vmull_s8(vb3x0, va3x0);
+      vprod0x3 = vmlal_s8(vprod0x3, vb3x1, va0x1);
+      vprod1x3 = vmlal_s8(vprod1x3, vb3x1, va1x1);
+      vprod2x3 = vmlal_s8(vprod2x3, vb3x1, va2x1);
+      vprod3x3 = vmlal_s8(vprod3x3, vb3x1, va3x1);
+      vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+      vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);
+      vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
+      vacc3x3 = vpadalq_s16(vacc3x3, vprod3x3);
+      const int8x8_t vb4x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      int16x8_t vprod0x4 = vmull_s8(vb4x0, va0x0);
+      int16x8_t vprod1x4 = vmull_s8(vb4x0, va1x0);
+      int16x8_t vprod2x4 = vmull_s8(vb4x0, va2x0);
+      int16x8_t vprod3x4 = vmull_s8(vb4x0, va3x0);
+      vprod0x4 = vmlal_s8(vprod0x4, vb4x1, va0x1);
+      vprod1x4 = vmlal_s8(vprod1x4, vb4x1, va1x1);
+      vprod2x4 = vmlal_s8(vprod2x4, vb4x1, va2x1);
+      vprod3x4 = vmlal_s8(vprod3x4, vb4x1, va3x1);
+      vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+      vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);
+      vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);
+      vacc3x4 = vpadalq_s16(vacc3x4, vprod3x4);
+      const int8x8_t vb5x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      int16x8_t vprod0x5 = vmull_s8(vb5x0, va0x0);
+      int16x8_t vprod1x5 = vmull_s8(vb5x0, va1x0);
+      int16x8_t vprod2x5 = vmull_s8(vb5x0, va2x0);
+      int16x8_t vprod3x5 = vmull_s8(vb5x0, va3x0);
+      vprod0x5 = vmlal_s8(vprod0x5, vb5x1, va0x1);
+      vprod1x5 = vmlal_s8(vprod1x5, vb5x1, va1x1);
+      vprod2x5 = vmlal_s8(vprod2x5, vb5x1, va2x1);
+      vprod3x5 = vmlal_s8(vprod3x5, vb5x1, va3x1);
+      vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+      vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);
+      vacc2x5 = vpadalq_s16(vacc2x5, vprod2x5);
+      vacc3x5 = vpadalq_s16(vacc3x5, vprod3x5);
+      const int8x8_t vb6x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      int16x8_t vprod0x6 = vmull_s8(vb6x0, va0x0);
+      int16x8_t vprod1x6 = vmull_s8(vb6x0, va1x0);
+      int16x8_t vprod2x6 = vmull_s8(vb6x0, va2x0);
+      int16x8_t vprod3x6 = vmull_s8(vb6x0, va3x0);
+      vprod0x6 = vmlal_s8(vprod0x6, vb6x1, va0x1);
+      vprod1x6 = vmlal_s8(vprod1x6, vb6x1, va1x1);
+      vprod2x6 = vmlal_s8(vprod2x6, vb6x1, va2x1);
+      vprod3x6 = vmlal_s8(vprod3x6, vb6x1, va3x1);
+      vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+      vacc1x6 = vpadalq_s16(vacc1x6, vprod1x6);
+      vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6);
+      vacc3x6 = vpadalq_s16(vacc3x6, vprod3x6);
+      const int8x8_t vb7x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      int16x8_t vprod0x7 = vmull_s8(vb7x0, va0x0);
+      int16x8_t vprod1x7 = vmull_s8(vb7x0, va1x0);
+      int16x8_t vprod2x7 = vmull_s8(vb7x0, va2x0);
+      int16x8_t vprod3x7 = vmull_s8(vb7x0, va3x0);
+      vprod0x7 = vmlal_s8(vprod0x7, vb7x1, va0x1);
+      vprod1x7 = vmlal_s8(vprod1x7, vb7x1, va1x1);
+      vprod2x7 = vmlal_s8(vprod2x7, vb7x1, va2x1);
+      vprod3x7 = vmlal_s8(vprod3x7, vb7x1, va3x1);
+      vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+      vacc1x7 = vpadalq_s16(vacc1x7, vprod1x7);
+      vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7);
+      vacc3x7 = vpadalq_s16(vacc3x7, vprod3x7);
+
+      k -= 16 * sizeof(int8_t);
+    }
+
+    // Handle 8 bytes at a time using MUL.
+    if (k > 0) {
+      const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+      const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+      const int8x8_t va2 = vld1_s8(a2); a2 += 8;
+      const int8x8_t va3 = vld1_s8(a3); a3 += 8;
+
+      const int8x8_t vb0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x0 = vmull_s8(vb0, va0);
+      const int16x8_t vprod1x0 = vmull_s8(vb0, va1);
+      const int16x8_t vprod2x0 = vmull_s8(vb0, va2);
+      const int16x8_t vprod3x0 = vmull_s8(vb0, va3);
+      vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+      vacc1x0 = vpadalq_s16(vacc1x0, vprod1x0);
+      vacc2x0 = vpadalq_s16(vacc2x0, vprod2x0);
+      vacc3x0 = vpadalq_s16(vacc3x0, vprod3x0);
+      const int8x8_t vb1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x1 = vmull_s8(vb1, va0);
+      const int16x8_t vprod1x1 = vmull_s8(vb1, va1);
+      const int16x8_t vprod2x1 = vmull_s8(vb1, va2);
+      const int16x8_t vprod3x1 = vmull_s8(vb1, va3);
+      vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+      vacc1x1 = vpadalq_s16(vacc1x1, vprod1x1);
+      vacc2x1 = vpadalq_s16(vacc2x1, vprod2x1);
+      vacc3x1 = vpadalq_s16(vacc3x1, vprod3x1);
+      const int8x8_t vb2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x2 = vmull_s8(vb2, va0);
+      const int16x8_t vprod1x2 = vmull_s8(vb2, va1);
+      const int16x8_t vprod2x2 = vmull_s8(vb2, va2);
+      const int16x8_t vprod3x2 = vmull_s8(vb2, va3);
+      vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+      vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);
+      vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
+      vacc3x2 = vpadalq_s16(vacc3x2, vprod3x2);
+      const int8x8_t vb3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x3 = vmull_s8(vb3, va0);
+      const int16x8_t vprod1x3 = vmull_s8(vb3, va1);
+      const int16x8_t vprod2x3 = vmull_s8(vb3, va2);
+      const int16x8_t vprod3x3 = vmull_s8(vb3, va3);
+      vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+      vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);
+      vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
+      vacc3x3 = vpadalq_s16(vacc3x3, vprod3x3);
+      const int8x8_t vb4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x4 = vmull_s8(vb4, va0);
+      const int16x8_t vprod1x4 = vmull_s8(vb4, va1);
+      const int16x8_t vprod2x4 = vmull_s8(vb4, va2);
+      const int16x8_t vprod3x4 = vmull_s8(vb4, va3);
+      vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+      vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);
+      vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);
+      vacc3x4 = vpadalq_s16(vacc3x4, vprod3x4);
+      const int8x8_t vb5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x5 = vmull_s8(vb5, va0);
+      const int16x8_t vprod1x5 = vmull_s8(vb5, va1);
+      const int16x8_t vprod2x5 = vmull_s8(vb5, va2);
+      const int16x8_t vprod3x5 = vmull_s8(vb5, va3);
+      vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+      vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);
+      vacc2x5 = vpadalq_s16(vacc2x5, vprod2x5);
+      vacc3x5 = vpadalq_s16(vacc3x5, vprod3x5);
+      const int8x8_t vb6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x6 = vmull_s8(vb6, va0);
+      const int16x8_t vprod1x6 = vmull_s8(vb6, va1);
+      const int16x8_t vprod2x6 = vmull_s8(vb6, va2);
+      const int16x8_t vprod3x6 = vmull_s8(vb6, va3);
+      vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+      vacc1x6 = vpadalq_s16(vacc1x6, vprod1x6);
+      vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6);
+      vacc3x6 = vpadalq_s16(vacc3x6, vprod3x6);
+      const int8x8_t vb7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x7 = vmull_s8(vb7, va0);
+      const int16x8_t vprod1x7 = vmull_s8(vb7, va1);
+      const int16x8_t vprod2x7 = vmull_s8(vb7, va2);
+      const int16x8_t vprod3x7 = vmull_s8(vb7, va3);
+      vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+      vacc1x7 = vpadalq_s16(vacc1x7, vprod1x7);
+      vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7);
+      vacc3x7 = vpadalq_s16(vacc3x7, vprod3x7);
+
+      k -= 8 * sizeof(int8_t);
+    }
+
+#if XNN_ARCH_ARM64
+    const int32x4_t vsum0x01 = vpaddq_s32(vacc0x0, vacc0x1);
+    const int32x4_t vsum0x23 = vpaddq_s32(vacc0x2, vacc0x3);
+    const int32x4_t vsum0x45 = vpaddq_s32(vacc0x4, vacc0x5);
+    const int32x4_t vsum0x67 = vpaddq_s32(vacc0x6, vacc0x7);
+    const int32x4_t vsum1x01 = vpaddq_s32(vacc1x0, vacc1x1);
+    const int32x4_t vsum1x23 = vpaddq_s32(vacc1x2, vacc1x3);
+    const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);
+    const int32x4_t vsum1x67 = vpaddq_s32(vacc1x6, vacc1x7);
+    const int32x4_t vsum2x01 = vpaddq_s32(vacc2x0, vacc2x1);
+    const int32x4_t vsum2x23 = vpaddq_s32(vacc2x2, vacc2x3);
+    const int32x4_t vsum2x45 = vpaddq_s32(vacc2x4, vacc2x5);
+    const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7);
+    const int32x4_t vsum3x01 = vpaddq_s32(vacc3x0, vacc3x1);
+    const int32x4_t vsum3x23 = vpaddq_s32(vacc3x2, vacc3x3);
+    const int32x4_t vsum3x45 = vpaddq_s32(vacc3x4, vacc3x5);
+    const int32x4_t vsum3x67 = vpaddq_s32(vacc3x6, vacc3x7);
+    int32x4_t vacc0x0123 = vpaddq_s32(vsum0x01, vsum0x23);
+    int32x4_t vacc0x4567 = vpaddq_s32(vsum0x45, vsum0x67);
+    int32x4_t vacc1x0123 = vpaddq_s32(vsum1x01, vsum1x23);
+    int32x4_t vacc1x4567 = vpaddq_s32(vsum1x45, vsum1x67);
+    int32x4_t vacc2x0123 = vpaddq_s32(vsum2x01, vsum2x23);
+    int32x4_t vacc2x4567 = vpaddq_s32(vsum2x45, vsum2x67);
+    int32x4_t vacc3x0123 = vpaddq_s32(vsum3x01, vsum3x23);
+    int32x4_t vacc3x4567 = vpaddq_s32(vsum3x45, vsum3x67);
+#else
+    const int32x2_t vpsum0x0 = vadd_s32(vget_low_s32(vacc0x0), vget_high_s32(vacc0x0));
+    const int32x2_t vpsum0x1 = vadd_s32(vget_low_s32(vacc0x1), vget_high_s32(vacc0x1));
+    const int32x2_t vpsum0x2 = vadd_s32(vget_low_s32(vacc0x2), vget_high_s32(vacc0x2));
+    const int32x2_t vpsum0x3 = vadd_s32(vget_low_s32(vacc0x3), vget_high_s32(vacc0x3));
+    const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1);
+    const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3);
+    int32x4_t vacc0x0123 = vcombine_s32(vsum0x01, vsum0x23);
+    const int32x2_t vpsum0x4 = vadd_s32(vget_low_s32(vacc0x4), vget_high_s32(vacc0x4));
+    const int32x2_t vpsum0x5 = vadd_s32(vget_low_s32(vacc0x5), vget_high_s32(vacc0x5));
+    const int32x2_t vpsum0x6 = vadd_s32(vget_low_s32(vacc0x6), vget_high_s32(vacc0x6));
+    const int32x2_t vpsum0x7 = vadd_s32(vget_low_s32(vacc0x7), vget_high_s32(vacc0x7));
+    const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5);
+    const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7);
+    int32x4_t vacc0x4567 = vcombine_s32(vsum0x45, vsum0x67);
+    const int32x2_t vpsum1x0 = vadd_s32(vget_low_s32(vacc1x0), vget_high_s32(vacc1x0));
+    const int32x2_t vpsum1x1 = vadd_s32(vget_low_s32(vacc1x1), vget_high_s32(vacc1x1));
+    const int32x2_t vpsum1x2 = vadd_s32(vget_low_s32(vacc1x2), vget_high_s32(vacc1x2));
+    const int32x2_t vpsum1x3 = vadd_s32(vget_low_s32(vacc1x3), vget_high_s32(vacc1x3));
+    const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1);
+    const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3);
+    int32x4_t vacc1x0123 = vcombine_s32(vsum1x01, vsum1x23);
+    const int32x2_t vpsum1x4 = vadd_s32(vget_low_s32(vacc1x4), vget_high_s32(vacc1x4));
+    const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));
+    const int32x2_t vpsum1x6 = vadd_s32(vget_low_s32(vacc1x6), vget_high_s32(vacc1x6));
+    const int32x2_t vpsum1x7 = vadd_s32(vget_low_s32(vacc1x7), vget_high_s32(vacc1x7));
+    const int32x2_t vsum1x45 = vpadd_s32(vpsum1x4, vpsum1x5);
+    const int32x2_t vsum1x67 = vpadd_s32(vpsum1x6, vpsum1x7);
+    int32x4_t vacc1x4567 = vcombine_s32(vsum1x45, vsum1x67);
+    const int32x2_t vpsum2x0 = vadd_s32(vget_low_s32(vacc2x0), vget_high_s32(vacc2x0));
+    const int32x2_t vpsum2x1 = vadd_s32(vget_low_s32(vacc2x1), vget_high_s32(vacc2x1));
+    const int32x2_t vpsum2x2 = vadd_s32(vget_low_s32(vacc2x2), vget_high_s32(vacc2x2));
+    const int32x2_t vpsum2x3 = vadd_s32(vget_low_s32(vacc2x3), vget_high_s32(vacc2x3));
+    const int32x2_t vsum2x01 = vpadd_s32(vpsum2x0, vpsum2x1);
+    const int32x2_t vsum2x23 = vpadd_s32(vpsum2x2, vpsum2x3);
+    int32x4_t vacc2x0123 = vcombine_s32(vsum2x01, vsum2x23);
+    const int32x2_t vpsum2x4 = vadd_s32(vget_low_s32(vacc2x4), vget_high_s32(vacc2x4));
+    const int32x2_t vpsum2x5 = vadd_s32(vget_low_s32(vacc2x5), vget_high_s32(vacc2x5));
+    const int32x2_t vpsum2x6 = vadd_s32(vget_low_s32(vacc2x6), vget_high_s32(vacc2x6));
+    const int32x2_t vpsum2x7 = vadd_s32(vget_low_s32(vacc2x7), vget_high_s32(vacc2x7));
+    const int32x2_t vsum2x45 = vpadd_s32(vpsum2x4, vpsum2x5);
+    const int32x2_t vsum2x67 = vpadd_s32(vpsum2x6, vpsum2x7);
+    int32x4_t vacc2x4567 = vcombine_s32(vsum2x45, vsum2x67);
+    const int32x2_t vpsum3x0 = vadd_s32(vget_low_s32(vacc3x0), vget_high_s32(vacc3x0));
+    const int32x2_t vpsum3x1 = vadd_s32(vget_low_s32(vacc3x1), vget_high_s32(vacc3x1));
+    const int32x2_t vpsum3x2 = vadd_s32(vget_low_s32(vacc3x2), vget_high_s32(vacc3x2));
+    const int32x2_t vpsum3x3 = vadd_s32(vget_low_s32(vacc3x3), vget_high_s32(vacc3x3));
+    const int32x2_t vsum3x01 = vpadd_s32(vpsum3x0, vpsum3x1);
+    const int32x2_t vsum3x23 = vpadd_s32(vpsum3x2, vpsum3x3);
+    int32x4_t vacc3x0123 = vcombine_s32(vsum3x01, vsum3x23);
+    const int32x2_t vpsum3x4 = vadd_s32(vget_low_s32(vacc3x4), vget_high_s32(vacc3x4));
+    const int32x2_t vpsum3x5 = vadd_s32(vget_low_s32(vacc3x5), vget_high_s32(vacc3x5));
+    const int32x2_t vpsum3x6 = vadd_s32(vget_low_s32(vacc3x6), vget_high_s32(vacc3x6));
+    const int32x2_t vpsum3x7 = vadd_s32(vget_low_s32(vacc3x7), vget_high_s32(vacc3x7));
+    const int32x2_t vsum3x45 = vpadd_s32(vpsum3x4, vpsum3x5);
+    const int32x2_t vsum3x67 = vpadd_s32(vpsum3x6, vpsum3x7);
+    int32x4_t vacc3x4567 = vcombine_s32(vsum3x45, vsum3x67);
+#endif
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+    vacc3x0123 = vqrdmulhq_s32(vacc3x0123, vmultiplier);
+    vacc3x4567 = vqrdmulhq_s32(vacc3x4567, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+    vacc3x0123 = vsraq_n_s32(vacc3x0123, vbicq_s32(vacc3x0123, vzero_shift_mask), 31);
+    vacc3x4567 = vsraq_n_s32(vacc3x4567, vbicq_s32(vacc3x4567, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+    vacc3x0123 = vrshlq_s32(vacc3x0123, vright_shift);
+    vacc3x4567 = vrshlq_s32(vacc3x4567, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+    const int16x8_t vacc3x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567), voutput_zero_point);
+    int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
+    int8x16_t vout2x01234567_3x01234567 = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc3x01234567);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+    const int16x8_t vacc3x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x0123), vqmovn_s32(vacc3x4567)), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
+    int8x16_t vout2x01234567_3x01234567 = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc3x01234567));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min);
+    vout2x01234567_3x01234567 = vmaxq_s8(vout2x01234567_3x01234567, voutput_min);
+
+    vout0x01234567_1x01234567 = vminq_s8(vout0x01234567_1x01234567, voutput_max);
+    vout2x01234567_3x01234567 = vminq_s8(vout2x01234567_3x01234567, voutput_max);
+
+    if (nc >= 8) {
+      vst1_s8(c0 + 0, vget_low_s8(vout0x01234567_1x01234567));
+      vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567));
+      vst1_s8(c2 + 0, vget_low_s8(vout2x01234567_3x01234567));
+      vst1_s8(c3 + 0, vget_high_s8(vout2x01234567_3x01234567));
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 0); c2 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c3, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 2); c3 += 4;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 0); c2 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c3, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 4); c3 += 2;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c2, vout2x01234567_3x01234567, 0);
+        vst1q_lane_s8(c3, vout2x01234567_3x01234567, 8);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
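
The two new 4x8c8 kernels in this change (the MLAL "mlal-padal" variant above and the MULL-only variant that follows) accumulate the same arithmetic; only the inner-loop unrolling differs. For reference, each int32x4_t accumulator vaccMxN gathers the widened products for output row M and column N (vmull_s8/vmlal_s8 produce 16-bit products, vpadalq_s16 folds them into the four 32-bit lanes), and the vpaddq/vpadd reduction then collapses those lanes into the bias plus an int8 dot product over the padded depth kc. The helper below is a hedged scalar sketch of that value only; the name qs8_c8_dot_model is hypothetical and not part of XNNPACK.

#include <stddef.h>
#include <stdint.h>

// Hypothetical scalar model (not XNNPACK code) of one output accumulator of the
// 4x8c8 kernels: bias for column N plus the dot product of activation row M
// with the packed weights for column N, over the padded depth kc.
static int32_t qs8_c8_dot_model(int32_t bias, const int8_t* a_row, const int8_t* w_col, size_t kc) {
  int32_t acc = bias;
  for (size_t k = 0; k < kc; k++) {
    // vmull_s8 widens each int8*int8 product to int16; vpadalq_s16 then adds
    // adjacent pairs of those products into the int32 accumulator lanes.
    acc += (int32_t) a_row[k] * (int32_t) w_col[k];
  }
  return acc;
}
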
diff --git a/src/qs8-gemm/gen/4x8c8-minmax-neon-mull-padal.c b/src/qs8-gemm/gen/4x8c8-minmax-neon-mull-padal.c
new file mode 100644
index 0000000..31dddfb
--- /dev/null
+++ b/src/qs8-gemm/gen/4x8c8-minmax-neon-mull-padal.c
@@ -0,0 +1,364 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/c8-neon-mull-padal.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+  const int8_t* a3 = (const int8_t*) ((uintptr_t) a2 + a_stride);
+  int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    a3 = a2;
+    c3 = c2;
+  }
+
+  do {
+    int32x4_t vacc0x0 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x1 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x2 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x3 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x4 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x5 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x6 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x7 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc1x0 = vacc0x0;
+    int32x4_t vacc1x1 = vacc0x1;
+    int32x4_t vacc1x2 = vacc0x2;
+    int32x4_t vacc1x3 = vacc0x3;
+    int32x4_t vacc1x4 = vacc0x4;
+    int32x4_t vacc1x5 = vacc0x5;
+    int32x4_t vacc1x6 = vacc0x6;
+    int32x4_t vacc1x7 = vacc0x7;
+    int32x4_t vacc2x0 = vacc0x0;
+    int32x4_t vacc2x1 = vacc0x1;
+    int32x4_t vacc2x2 = vacc0x2;
+    int32x4_t vacc2x3 = vacc0x3;
+    int32x4_t vacc2x4 = vacc0x4;
+    int32x4_t vacc2x5 = vacc0x5;
+    int32x4_t vacc2x6 = vacc0x6;
+    int32x4_t vacc2x7 = vacc0x7;
+    int32x4_t vacc3x0 = vacc0x0;
+    int32x4_t vacc3x1 = vacc0x1;
+    int32x4_t vacc3x2 = vacc0x2;
+    int32x4_t vacc3x3 = vacc0x3;
+    int32x4_t vacc3x4 = vacc0x4;
+    int32x4_t vacc3x5 = vacc0x5;
+    int32x4_t vacc3x6 = vacc0x6;
+    int32x4_t vacc3x7 = vacc0x7;
+
+    size_t k = kc;
+
+    // Handle 8 bytes at a time using MUL.
+    while (k > 0) {
+      const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+      const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+      const int8x8_t va2 = vld1_s8(a2); a2 += 8;
+      const int8x8_t va3 = vld1_s8(a3); a3 += 8;
+
+      const int8x8_t vb0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x0 = vmull_s8(vb0, va0);
+      const int16x8_t vprod1x0 = vmull_s8(vb0, va1);
+      const int16x8_t vprod2x0 = vmull_s8(vb0, va2);
+      const int16x8_t vprod3x0 = vmull_s8(vb0, va3);
+      vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+      vacc1x0 = vpadalq_s16(vacc1x0, vprod1x0);
+      vacc2x0 = vpadalq_s16(vacc2x0, vprod2x0);
+      vacc3x0 = vpadalq_s16(vacc3x0, vprod3x0);
+      const int8x8_t vb1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x1 = vmull_s8(vb1, va0);
+      const int16x8_t vprod1x1 = vmull_s8(vb1, va1);
+      const int16x8_t vprod2x1 = vmull_s8(vb1, va2);
+      const int16x8_t vprod3x1 = vmull_s8(vb1, va3);
+      vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+      vacc1x1 = vpadalq_s16(vacc1x1, vprod1x1);
+      vacc2x1 = vpadalq_s16(vacc2x1, vprod2x1);
+      vacc3x1 = vpadalq_s16(vacc3x1, vprod3x1);
+      const int8x8_t vb2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x2 = vmull_s8(vb2, va0);
+      const int16x8_t vprod1x2 = vmull_s8(vb2, va1);
+      const int16x8_t vprod2x2 = vmull_s8(vb2, va2);
+      const int16x8_t vprod3x2 = vmull_s8(vb2, va3);
+      vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+      vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);
+      vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
+      vacc3x2 = vpadalq_s16(vacc3x2, vprod3x2);
+      const int8x8_t vb3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x3 = vmull_s8(vb3, va0);
+      const int16x8_t vprod1x3 = vmull_s8(vb3, va1);
+      const int16x8_t vprod2x3 = vmull_s8(vb3, va2);
+      const int16x8_t vprod3x3 = vmull_s8(vb3, va3);
+      vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+      vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);
+      vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
+      vacc3x3 = vpadalq_s16(vacc3x3, vprod3x3);
+      const int8x8_t vb4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x4 = vmull_s8(vb4, va0);
+      const int16x8_t vprod1x4 = vmull_s8(vb4, va1);
+      const int16x8_t vprod2x4 = vmull_s8(vb4, va2);
+      const int16x8_t vprod3x4 = vmull_s8(vb4, va3);
+      vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+      vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);
+      vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);
+      vacc3x4 = vpadalq_s16(vacc3x4, vprod3x4);
+      const int8x8_t vb5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x5 = vmull_s8(vb5, va0);
+      const int16x8_t vprod1x5 = vmull_s8(vb5, va1);
+      const int16x8_t vprod2x5 = vmull_s8(vb5, va2);
+      const int16x8_t vprod3x5 = vmull_s8(vb5, va3);
+      vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+      vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);
+      vacc2x5 = vpadalq_s16(vacc2x5, vprod2x5);
+      vacc3x5 = vpadalq_s16(vacc3x5, vprod3x5);
+      const int8x8_t vb6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x6 = vmull_s8(vb6, va0);
+      const int16x8_t vprod1x6 = vmull_s8(vb6, va1);
+      const int16x8_t vprod2x6 = vmull_s8(vb6, va2);
+      const int16x8_t vprod3x6 = vmull_s8(vb6, va3);
+      vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+      vacc1x6 = vpadalq_s16(vacc1x6, vprod1x6);
+      vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6);
+      vacc3x6 = vpadalq_s16(vacc3x6, vprod3x6);
+      const int8x8_t vb7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+      const int16x8_t vprod0x7 = vmull_s8(vb7, va0);
+      const int16x8_t vprod1x7 = vmull_s8(vb7, va1);
+      const int16x8_t vprod2x7 = vmull_s8(vb7, va2);
+      const int16x8_t vprod3x7 = vmull_s8(vb7, va3);
+      vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+      vacc1x7 = vpadalq_s16(vacc1x7, vprod1x7);
+      vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7);
+      vacc3x7 = vpadalq_s16(vacc3x7, vprod3x7);
+
+      k -= 8 * sizeof(int8_t);
+    }
+
+#if XNN_ARCH_ARM64
+    const int32x4_t vsum0x01 = vpaddq_s32(vacc0x0, vacc0x1);
+    const int32x4_t vsum0x23 = vpaddq_s32(vacc0x2, vacc0x3);
+    const int32x4_t vsum0x45 = vpaddq_s32(vacc0x4, vacc0x5);
+    const int32x4_t vsum0x67 = vpaddq_s32(vacc0x6, vacc0x7);
+    const int32x4_t vsum1x01 = vpaddq_s32(vacc1x0, vacc1x1);
+    const int32x4_t vsum1x23 = vpaddq_s32(vacc1x2, vacc1x3);
+    const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);
+    const int32x4_t vsum1x67 = vpaddq_s32(vacc1x6, vacc1x7);
+    const int32x4_t vsum2x01 = vpaddq_s32(vacc2x0, vacc2x1);
+    const int32x4_t vsum2x23 = vpaddq_s32(vacc2x2, vacc2x3);
+    const int32x4_t vsum2x45 = vpaddq_s32(vacc2x4, vacc2x5);
+    const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7);
+    const int32x4_t vsum3x01 = vpaddq_s32(vacc3x0, vacc3x1);
+    const int32x4_t vsum3x23 = vpaddq_s32(vacc3x2, vacc3x3);
+    const int32x4_t vsum3x45 = vpaddq_s32(vacc3x4, vacc3x5);
+    const int32x4_t vsum3x67 = vpaddq_s32(vacc3x6, vacc3x7);
+    int32x4_t vacc0x0123 = vpaddq_s32(vsum0x01, vsum0x23);
+    int32x4_t vacc0x4567 = vpaddq_s32(vsum0x45, vsum0x67);
+    int32x4_t vacc1x0123 = vpaddq_s32(vsum1x01, vsum1x23);
+    int32x4_t vacc1x4567 = vpaddq_s32(vsum1x45, vsum1x67);
+    int32x4_t vacc2x0123 = vpaddq_s32(vsum2x01, vsum2x23);
+    int32x4_t vacc2x4567 = vpaddq_s32(vsum2x45, vsum2x67);
+    int32x4_t vacc3x0123 = vpaddq_s32(vsum3x01, vsum3x23);
+    int32x4_t vacc3x4567 = vpaddq_s32(vsum3x45, vsum3x67);
+#else
+    const int32x2_t vpsum0x0 = vadd_s32(vget_low_s32(vacc0x0), vget_high_s32(vacc0x0));
+    const int32x2_t vpsum0x1 = vadd_s32(vget_low_s32(vacc0x1), vget_high_s32(vacc0x1));
+    const int32x2_t vpsum0x2 = vadd_s32(vget_low_s32(vacc0x2), vget_high_s32(vacc0x2));
+    const int32x2_t vpsum0x3 = vadd_s32(vget_low_s32(vacc0x3), vget_high_s32(vacc0x3));
+    const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1);
+    const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3);
+    int32x4_t vacc0x0123 = vcombine_s32(vsum0x01, vsum0x23);
+    const int32x2_t vpsum0x4 = vadd_s32(vget_low_s32(vacc0x4), vget_high_s32(vacc0x4));
+    const int32x2_t vpsum0x5 = vadd_s32(vget_low_s32(vacc0x5), vget_high_s32(vacc0x5));
+    const int32x2_t vpsum0x6 = vadd_s32(vget_low_s32(vacc0x6), vget_high_s32(vacc0x6));
+    const int32x2_t vpsum0x7 = vadd_s32(vget_low_s32(vacc0x7), vget_high_s32(vacc0x7));
+    const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5);
+    const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7);
+    int32x4_t vacc0x4567 = vcombine_s32(vsum0x45, vsum0x67);
+    const int32x2_t vpsum1x0 = vadd_s32(vget_low_s32(vacc1x0), vget_high_s32(vacc1x0));
+    const int32x2_t vpsum1x1 = vadd_s32(vget_low_s32(vacc1x1), vget_high_s32(vacc1x1));
+    const int32x2_t vpsum1x2 = vadd_s32(vget_low_s32(vacc1x2), vget_high_s32(vacc1x2));
+    const int32x2_t vpsum1x3 = vadd_s32(vget_low_s32(vacc1x3), vget_high_s32(vacc1x3));
+    const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1);
+    const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3);
+    int32x4_t vacc1x0123 = vcombine_s32(vsum1x01, vsum1x23);
+    const int32x2_t vpsum1x4 = vadd_s32(vget_low_s32(vacc1x4), vget_high_s32(vacc1x4));
+    const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));
+    const int32x2_t vpsum1x6 = vadd_s32(vget_low_s32(vacc1x6), vget_high_s32(vacc1x6));
+    const int32x2_t vpsum1x7 = vadd_s32(vget_low_s32(vacc1x7), vget_high_s32(vacc1x7));
+    const int32x2_t vsum1x45 = vpadd_s32(vpsum1x4, vpsum1x5);
+    const int32x2_t vsum1x67 = vpadd_s32(vpsum1x6, vpsum1x7);
+    int32x4_t vacc1x4567 = vcombine_s32(vsum1x45, vsum1x67);
+    const int32x2_t vpsum2x0 = vadd_s32(vget_low_s32(vacc2x0), vget_high_s32(vacc2x0));
+    const int32x2_t vpsum2x1 = vadd_s32(vget_low_s32(vacc2x1), vget_high_s32(vacc2x1));
+    const int32x2_t vpsum2x2 = vadd_s32(vget_low_s32(vacc2x2), vget_high_s32(vacc2x2));
+    const int32x2_t vpsum2x3 = vadd_s32(vget_low_s32(vacc2x3), vget_high_s32(vacc2x3));
+    const int32x2_t vsum2x01 = vpadd_s32(vpsum2x0, vpsum2x1);
+    const int32x2_t vsum2x23 = vpadd_s32(vpsum2x2, vpsum2x3);
+    int32x4_t vacc2x0123 = vcombine_s32(vsum2x01, vsum2x23);
+    const int32x2_t vpsum2x4 = vadd_s32(vget_low_s32(vacc2x4), vget_high_s32(vacc2x4));
+    const int32x2_t vpsum2x5 = vadd_s32(vget_low_s32(vacc2x5), vget_high_s32(vacc2x5));
+    const int32x2_t vpsum2x6 = vadd_s32(vget_low_s32(vacc2x6), vget_high_s32(vacc2x6));
+    const int32x2_t vpsum2x7 = vadd_s32(vget_low_s32(vacc2x7), vget_high_s32(vacc2x7));
+    const int32x2_t vsum2x45 = vpadd_s32(vpsum2x4, vpsum2x5);
+    const int32x2_t vsum2x67 = vpadd_s32(vpsum2x6, vpsum2x7);
+    int32x4_t vacc2x4567 = vcombine_s32(vsum2x45, vsum2x67);
+    const int32x2_t vpsum3x0 = vadd_s32(vget_low_s32(vacc3x0), vget_high_s32(vacc3x0));
+    const int32x2_t vpsum3x1 = vadd_s32(vget_low_s32(vacc3x1), vget_high_s32(vacc3x1));
+    const int32x2_t vpsum3x2 = vadd_s32(vget_low_s32(vacc3x2), vget_high_s32(vacc3x2));
+    const int32x2_t vpsum3x3 = vadd_s32(vget_low_s32(vacc3x3), vget_high_s32(vacc3x3));
+    const int32x2_t vsum3x01 = vpadd_s32(vpsum3x0, vpsum3x1);
+    const int32x2_t vsum3x23 = vpadd_s32(vpsum3x2, vpsum3x3);
+    int32x4_t vacc3x0123 = vcombine_s32(vsum3x01, vsum3x23);
+    const int32x2_t vpsum3x4 = vadd_s32(vget_low_s32(vacc3x4), vget_high_s32(vacc3x4));
+    const int32x2_t vpsum3x5 = vadd_s32(vget_low_s32(vacc3x5), vget_high_s32(vacc3x5));
+    const int32x2_t vpsum3x6 = vadd_s32(vget_low_s32(vacc3x6), vget_high_s32(vacc3x6));
+    const int32x2_t vpsum3x7 = vadd_s32(vget_low_s32(vacc3x7), vget_high_s32(vacc3x7));
+    const int32x2_t vsum3x45 = vpadd_s32(vpsum3x4, vpsum3x5);
+    const int32x2_t vsum3x67 = vpadd_s32(vpsum3x6, vpsum3x7);
+    int32x4_t vacc3x4567 = vcombine_s32(vsum3x45, vsum3x67);
+#endif
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+    vacc3x0123 = vqrdmulhq_s32(vacc3x0123, vmultiplier);
+    vacc3x4567 = vqrdmulhq_s32(vacc3x4567, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+    vacc3x0123 = vsraq_n_s32(vacc3x0123, vbicq_s32(vacc3x0123, vzero_shift_mask), 31);
+    vacc3x4567 = vsraq_n_s32(vacc3x4567, vbicq_s32(vacc3x4567, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+    vacc3x0123 = vrshlq_s32(vacc3x0123, vright_shift);
+    vacc3x4567 = vrshlq_s32(vacc3x4567, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+    const int16x8_t vacc3x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567), voutput_zero_point);
+    int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
+    int8x16_t vout2x01234567_3x01234567 = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc3x01234567);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+    const int16x8_t vacc3x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x0123), vqmovn_s32(vacc3x4567)), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
+    int8x16_t vout2x01234567_3x01234567 = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc3x01234567));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min);
+    vout2x01234567_3x01234567 = vmaxq_s8(vout2x01234567_3x01234567, voutput_min);
+
+    vout0x01234567_1x01234567 = vminq_s8(vout0x01234567_1x01234567, voutput_max);
+    vout2x01234567_3x01234567 = vminq_s8(vout2x01234567_3x01234567, voutput_max);
+
+    if (nc >= 8) {
+      vst1_s8(c0 + 0, vget_low_s8(vout0x01234567_1x01234567));
+      vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567));
+      vst1_s8(c2 + 0, vget_low_s8(vout2x01234567_3x01234567));
+      vst1_s8(c3 + 0, vget_high_s8(vout2x01234567_3x01234567));
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 0); c2 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c3, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 2); c3 += 4;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 0); c2 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c3, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 4); c3 += 2;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c2, vout2x01234567_3x01234567, 0);
+        vst1q_lane_s8(c3, vout2x01234567_3x01234567, 8);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
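
Both new kernels finish with the same requantization sequence: vqrdmulhq_s32 by a fixed-point multiplier, a sign correction (vbicq_s32 + vsraq_n_s32) so that the following vrshlq_s32 rounding shift rounds ties away from zero, addition of the output zero point, and saturation/clamping to int8. A hedged scalar model of one lane of that sequence is sketched below. The name qs8_requantize_model is illustrative, not an XNNPACK function; the vqrdmulh saturation corner case is ignored, and shift here is the positive right-shift magnitude (the kernels store it as a non-positive value because vrshlq_s32 shifts left by a signed amount).

#include <stdint.h>

// Hypothetical scalar model of the per-lane requantization performed above.
// Assumes arithmetic right shift of negative values, as on the targets these
// kernels run on.
static int8_t qs8_requantize_model(int32_t acc, int32_t multiplier, uint32_t shift,
                                   int16_t zero_point, int8_t out_min, int8_t out_max) {
  // vqrdmulhq_s32: rounding doubling high multiply (saturation ignored here).
  const int64_t prod = (int64_t) acc * (int64_t) multiplier;
  int32_t q = (int32_t) ((prod + (INT64_C(1) << 30)) >> 31);
  if (shift != 0) {
    // vsraq_n_s32(acc, vbicq_s32(acc, zero_shift_mask), 31): decrement negative
    // values by one so the rounding shift below rounds ties away from zero.
    if (q < 0) {
      q -= 1;
    }
    // vrshlq_s32 with a negative shift amount: rounding arithmetic shift right.
    q = (int32_t) (((int64_t) q + (INT64_C(1) << (shift - 1))) >> shift);
  }
  // Add the output zero point, then clamp to the int8 output range (this stands
  // in for the saturating narrows vqmovn_s32/vqmovn_s16 plus vmaxq_s8/vminq_s8).
  int32_t out = q + (int32_t) zero_point;
  if (out < (int32_t) out_min) out = (int32_t) out_min;
  if (out > (int32_t) out_max) out = (int32_t) out_max;
  return (int8_t) out;
}
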
diff --git a/src/qs8-gemm/gen/6x16c4-minmax-neondot.c b/src/qs8-gemm/gen/6x16c4-minmax-neondot.c
index 7cf7166..0086b49 100644
--- a/src/qs8-gemm/gen/6x16c4-minmax-neondot.c
+++ b/src/qs8-gemm/gen/6x16c4-minmax-neondot.c
@@ -7,18 +7,14 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
-
 #include <assert.h>
 
 #include <arm_neon.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
-// This kernel uses ARMv8.2 dot-product instructions.
-//
-// Scalar model: xnn_qs8_gemm_minmax_ukernel_6x16c4__scalar. Refer to
-// that kernel for more comments.
 void xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot(
     size_t mr,
     size_t nc,
@@ -34,7 +30,12 @@
   assert(mr <= 6);
   assert(nc != 0);
   assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
 
+  kc = round_up_po2(kc, 4);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -110,14 +111,14 @@
       const int8x8_t va5x01234567 = vld1_s8(a5); a5 += 8;
 
       // Load a 8x16 block of weights.
-      const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb0123x89AB = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb0123xCDEF = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb4567x89AB = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb4567xCDEF = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
+      const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+      const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+      const int8x16_t vb0123x89AB = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+      const int8x16_t vb0123xCDEF = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+      const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+      const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+      const int8x16_t vb4567x89AB = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+      const int8x16_t vb4567xCDEF = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
 
       // Multiply-accumulate: 6x8 * 8x16 --> 6x16.
       vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb0123x0123, va0x01234567, 0);
@@ -171,21 +172,21 @@
 
       k -= 8 * sizeof(int8_t);
     }
-    // Handle up to 7 final positions of `k`
+    // Handle up to 4 final positions of `k`
     if XNN_UNLIKELY(k != 0) {
       // Load a 6x4 block of activations.
-      const int8x8_t va0x01234567 = vld1_s8(a0); a0 += k;
-      const int8x8_t va1x01234567 = vld1_s8(a1); a1 += k;
-      const int8x8_t va2x01234567 = vld1_s8(a2); a2 += k;
-      const int8x8_t va3x01234567 = vld1_s8(a3); a3 += k;
-      const int8x8_t va4x01234567 = vld1_s8(a4); a4 += k;
-      const int8x8_t va5x01234567 = vld1_s8(a5); a5 += k;
+      const int8x8_t va0x01234567 = vld1_s8(a0); a0 += 4;
+      const int8x8_t va1x01234567 = vld1_s8(a1); a1 += 4;
+      const int8x8_t va2x01234567 = vld1_s8(a2); a2 += 4;
+      const int8x8_t va3x01234567 = vld1_s8(a3); a3 += 4;
+      const int8x8_t va4x01234567 = vld1_s8(a4); a4 += 4;
+      const int8x8_t va5x01234567 = vld1_s8(a5); a5 += 4;
 
       // Load a 4x16 block of weights.
-      const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb0123x89AB = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb0123xCDEF = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
+      const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+      const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+      const int8x16_t vb0123x89AB = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+      const int8x16_t vb0123xCDEF = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
 
       // Multiply-accumulate: 6x4 * 4x16 --> 6x16.
       vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb0123x0123, va0x01234567, 0);
@@ -212,52 +213,9 @@
       vacc5x4567 = vdotq_lane_s32(vacc5x4567, vb0123x4567, va5x01234567, 0);
       vacc5x89AB = vdotq_lane_s32(vacc5x89AB, vb0123x89AB, va5x01234567, 0);
       vacc5xCDEF = vdotq_lane_s32(vacc5xCDEF, vb0123xCDEF, va5x01234567, 0);
-
-      if (k > 4) {
-        // Load a 4x16 block of weights.
-        const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb4567x89AB = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb4567xCDEF = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-
-        // Multiply-accumulate: 6x4 * 4x16 --> 6x16.
-        vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
-        vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
-        vacc0x89AB = vdotq_lane_s32(vacc0x89AB, vb4567x89AB, va0x01234567, 1);
-        vacc0xCDEF = vdotq_lane_s32(vacc0xCDEF, vb4567xCDEF, va0x01234567, 1);
-        vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb4567x0123, va1x01234567, 1);
-        vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb4567x4567, va1x01234567, 1);
-        vacc1x89AB = vdotq_lane_s32(vacc1x89AB, vb4567x89AB, va1x01234567, 1);
-        vacc1xCDEF = vdotq_lane_s32(vacc1xCDEF, vb4567xCDEF, va1x01234567, 1);
-        vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb4567x0123, va2x01234567, 1);
-        vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb4567x4567, va2x01234567, 1);
-        vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb4567x89AB, va2x01234567, 1);
-        vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb4567xCDEF, va2x01234567, 1);
-        vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb4567x0123, va3x01234567, 1);
-        vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb4567x4567, va3x01234567, 1);
-        vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb4567x89AB, va3x01234567, 1);
-        vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb4567xCDEF, va3x01234567, 1);
-        vacc4x0123 = vdotq_lane_s32(vacc4x0123, vb4567x0123, va4x01234567, 1);
-        vacc4x4567 = vdotq_lane_s32(vacc4x4567, vb4567x4567, va4x01234567, 1);
-        vacc4x89AB = vdotq_lane_s32(vacc4x89AB, vb4567x89AB, va4x01234567, 1);
-        vacc4xCDEF = vdotq_lane_s32(vacc4xCDEF, vb4567xCDEF, va4x01234567, 1);
-        vacc5x0123 = vdotq_lane_s32(vacc5x0123, vb4567x0123, va5x01234567, 1);
-        vacc5x4567 = vdotq_lane_s32(vacc5x4567, vb4567x4567, va5x01234567, 1);
-        vacc5x89AB = vdotq_lane_s32(vacc5x89AB, vb4567x89AB, va5x01234567, 1);
-        vacc5xCDEF = vdotq_lane_s32(vacc5xCDEF, vb4567xCDEF, va5x01234567, 1);
-      }
     }
-    // End of accumulation loop. The variable `kc` contains the amount by which
-    // we advanced the `va` pointers, so we rewind by this amount now.
-    a0 = (const int8_t*)((uintptr_t)a0 - kc);
-    a1 = (const int8_t*)((uintptr_t)a1 - kc);
-    a2 = (const int8_t*)((uintptr_t)a2 - kc);
-    a3 = (const int8_t*)((uintptr_t)a3 - kc);
-    a4 = (const int8_t*)((uintptr_t)a4 - kc);
-    a5 = (const int8_t*)((uintptr_t)a5 - kc);
 
     // Post-accumulation work
-
     const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
     const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
 
@@ -412,6 +370,13 @@
       c4 = (int8_t*) ((uintptr_t) c4 + cn_stride);
       c5 = (int8_t*) ((uintptr_t) c5 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+      a4 = (const int8_t*) ((uintptr_t) a4 - kc);
+      a5 = (const int8_t*) ((uintptr_t) a5 - kc);
+
       nc -= 16;
     } else {
       // Final case where not all of the 16 columns fit in the destination.
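
The simplified remainder handling in the rewritten neondot kernels relies on kc = round_up_po2(kc, 4) above: once the depth is padded to a multiple of 4, at most one 4-byte group can remain after the 8-wide main loop, so the old "if (k > 4)" branch is unreachable and the activation pointers are now rewound by kc in the column-block epilogue rather than right after the loop. A minimal sketch of that rounding, under the assumption that round_up_po2 from <xnnpack/math.h> rounds up to a power-of-two multiple with a mask:

#include <stddef.h>

// Illustrative equivalent of round_up_po2(kc, 4): round kc up to the next
// multiple of 4 (a power of two, so masking the low bits suffices).
static inline size_t round_up_4(size_t kc) {
  return (kc + 3) & ~(size_t) 3;
}
// With kc padded this way, kc % 8 is either 0 or 4, so the tail after the
// 8-byte main loop is always exactly one 4-byte group (k == 4) or nothing.
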
diff --git a/src/qs8-gemm/gen/6x8c4-minmax-neondot.c b/src/qs8-gemm/gen/6x8c4-minmax-neondot.c
index 3af0fd8..889b948 100644
--- a/src/qs8-gemm/gen/6x8c4-minmax-neondot.c
+++ b/src/qs8-gemm/gen/6x8c4-minmax-neondot.c
@@ -7,18 +7,14 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
-
 #include <assert.h>
 
 #include <arm_neon.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
-// This kernel uses ARMv8.2 dot-product instructions.
-//
-// Scalar model: xnn_qs8_gemm_minmax_ukernel_6x8c4__scalar. Refer to
-// that kernel for more comments.
 void xnn_qs8_gemm_minmax_ukernel_6x8c4__neondot(
     size_t mr,
     size_t nc,
@@ -34,7 +30,12 @@
   assert(mr <= 6);
   assert(nc != 0);
   assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
 
+  kc = round_up_po2(kc, 4);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -98,10 +99,10 @@
       const int8x8_t va5x01234567 = vld1_s8(a5); a5 += 8;
 
       // Load a 8x8 block of weights.
-      const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
+      const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+      const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+      const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+      const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
 
       // Multiply-accumulate: 6x8 * 8x8 --> 6x8.
       vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb0123x0123, va0x01234567, 0);
@@ -131,19 +132,19 @@
 
       k -= 8 * sizeof(int8_t);
     }
-    // Handle up to 7 final positions of `k`
+    // Handle up to 4 final positions of `k`
     if XNN_UNLIKELY(k != 0) {
       // Load a 6x4 block of activations.
-      const int8x8_t va0x01234567 = vld1_s8(a0); a0 += k;
-      const int8x8_t va1x01234567 = vld1_s8(a1); a1 += k;
-      const int8x8_t va2x01234567 = vld1_s8(a2); a2 += k;
-      const int8x8_t va3x01234567 = vld1_s8(a3); a3 += k;
-      const int8x8_t va4x01234567 = vld1_s8(a4); a4 += k;
-      const int8x8_t va5x01234567 = vld1_s8(a5); a5 += k;
+      const int8x8_t va0x01234567 = vld1_s8(a0); a0 += 4;
+      const int8x8_t va1x01234567 = vld1_s8(a1); a1 += 4;
+      const int8x8_t va2x01234567 = vld1_s8(a2); a2 += 4;
+      const int8x8_t va3x01234567 = vld1_s8(a3); a3 += 4;
+      const int8x8_t va4x01234567 = vld1_s8(a4); a4 += 4;
+      const int8x8_t va5x01234567 = vld1_s8(a5); a5 += 4;
 
       // Load a 4x8 block of weights.
-      const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
+      const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+      const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
 
       // Multiply-accumulate: 6x4 * 4x8 --> 6x8.
       vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb0123x0123, va0x01234567, 0);
@@ -158,38 +159,9 @@
       vacc4x4567 = vdotq_lane_s32(vacc4x4567, vb0123x4567, va4x01234567, 0);
       vacc5x0123 = vdotq_lane_s32(vacc5x0123, vb0123x0123, va5x01234567, 0);
       vacc5x4567 = vdotq_lane_s32(vacc5x4567, vb0123x4567, va5x01234567, 0);
-
-      if (k > 4) {
-        // Load a 4x8 block of weights.
-        const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-
-        // Multiply-accumulate: 6x4 * 4x8 --> 6x8.
-        vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
-        vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
-        vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb4567x0123, va1x01234567, 1);
-        vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb4567x4567, va1x01234567, 1);
-        vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb4567x0123, va2x01234567, 1);
-        vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb4567x4567, va2x01234567, 1);
-        vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb4567x0123, va3x01234567, 1);
-        vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb4567x4567, va3x01234567, 1);
-        vacc4x0123 = vdotq_lane_s32(vacc4x0123, vb4567x0123, va4x01234567, 1);
-        vacc4x4567 = vdotq_lane_s32(vacc4x4567, vb4567x4567, va4x01234567, 1);
-        vacc5x0123 = vdotq_lane_s32(vacc5x0123, vb4567x0123, va5x01234567, 1);
-        vacc5x4567 = vdotq_lane_s32(vacc5x4567, vb4567x4567, va5x01234567, 1);
-      }
     }
-    // End of accumulation loop. The variable `kc` contains the amount by which
-    // we advanced the `va` pointers, so we rewind by this amount now.
-    a0 = (const int8_t*)((uintptr_t)a0 - kc);
-    a1 = (const int8_t*)((uintptr_t)a1 - kc);
-    a2 = (const int8_t*)((uintptr_t)a2 - kc);
-    a3 = (const int8_t*)((uintptr_t)a3 - kc);
-    a4 = (const int8_t*)((uintptr_t)a4 - kc);
-    a5 = (const int8_t*)((uintptr_t)a5 - kc);
 
     // Post-accumulation work
-
     const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
     const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
 
@@ -284,6 +256,13 @@
       c4 = (int8_t*) ((uintptr_t) c4 + cn_stride);
       c5 = (int8_t*) ((uintptr_t) c5 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+      a4 = (const int8_t*) ((uintptr_t) a4 - kc);
+      a5 = (const int8_t*) ((uintptr_t) a5 - kc);
+
       nc -= 8;
     } else {
       // Final case where not all of the 8 columns fit in the destination.
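
All of the neondot kernels touched by this change build on vdotq_lane_s32, which accumulates four independent 4-element int8 dot products per call, e.g. vdotq_lane_s32(vacc0x0123, vb0123x0123, va0x01234567, 0) above. A hedged scalar model of that call pattern is sketched below; sdot_lane_model is an illustrative name only, with the arguments named after the kernel's vb (packed weights) and va (activations) vectors.

#include <stdint.h>

// Hypothetical scalar model of vdotq_lane_s32(acc, b, a, lane) as used above:
// lane (0 or 1) selects one group of four activations from `a`, and each of
// the four int32 accumulator lanes gains the dot product of that group with
// the corresponding group of four packed weights in `b`.
static void sdot_lane_model(int32_t acc[4], const int8_t b[16], const int8_t a[8], int lane) {
  for (int n = 0; n < 4; n++) {
    for (int i = 0; i < 4; i++) {
      acc[n] += (int32_t) b[4 * n + i] * (int32_t) a[4 * lane + i];
    }
  }
}
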
diff --git a/src/qs8-gemm/gen/8x16c4-minmax-neondot.c b/src/qs8-gemm/gen/8x16c4-minmax-neondot.c
index 62bb42c..3b2301b 100644
--- a/src/qs8-gemm/gen/8x16c4-minmax-neondot.c
+++ b/src/qs8-gemm/gen/8x16c4-minmax-neondot.c
@@ -7,18 +7,14 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
-
 #include <assert.h>
 
 #include <arm_neon.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
-// This kernel uses ARMv8.2 dot-product instructions.
-//
-// Scalar model: xnn_qs8_gemm_minmax_ukernel_8x16c4__scalar. Refer to
-// that kernel for more comments.
 void xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot(
     size_t mr,
     size_t nc,
@@ -34,7 +30,12 @@
   assert(mr <= 8);
   assert(nc != 0);
   assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
 
+  kc = round_up_po2(kc, 4);
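+  // Rounded-up kc matches the 4-element lanes of the NEON dot product; the tail below then handles at most 4 remaining positions of k.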
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -132,14 +133,14 @@
       const int8x8_t va7x01234567 = vld1_s8(a7); a7 += 8;
 
       // Load a 8x16 block of weights.
-      const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb0123x89AB = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb0123xCDEF = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb4567x89AB = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb4567xCDEF = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
+      const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+      const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+      const int8x16_t vb0123x89AB = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+      const int8x16_t vb0123xCDEF = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+      const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+      const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+      const int8x16_t vb4567x89AB = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+      const int8x16_t vb4567xCDEF = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
 
       // Multiply-accumulate: 8x8 * 8x16 --> 8x16.
       vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb0123x0123, va0x01234567, 0);
@@ -209,23 +210,23 @@
 
       k -= 8 * sizeof(int8_t);
     }
-    // Handle up to 7 final positions of `k`
+    // Handle up to 4 final positions of `k`
     if XNN_UNLIKELY(k != 0) {
       // Load a 8x4 block of activations.
-      const int8x8_t va0x01234567 = vld1_s8(a0); a0 += k;
-      const int8x8_t va1x01234567 = vld1_s8(a1); a1 += k;
-      const int8x8_t va2x01234567 = vld1_s8(a2); a2 += k;
-      const int8x8_t va3x01234567 = vld1_s8(a3); a3 += k;
-      const int8x8_t va4x01234567 = vld1_s8(a4); a4 += k;
-      const int8x8_t va5x01234567 = vld1_s8(a5); a5 += k;
-      const int8x8_t va6x01234567 = vld1_s8(a6); a6 += k;
-      const int8x8_t va7x01234567 = vld1_s8(a7); a7 += k;
+      const int8x8_t va0x01234567 = vld1_s8(a0); a0 += 4;
+      const int8x8_t va1x01234567 = vld1_s8(a1); a1 += 4;
+      const int8x8_t va2x01234567 = vld1_s8(a2); a2 += 4;
+      const int8x8_t va3x01234567 = vld1_s8(a3); a3 += 4;
+      const int8x8_t va4x01234567 = vld1_s8(a4); a4 += 4;
+      const int8x8_t va5x01234567 = vld1_s8(a5); a5 += 4;
+      const int8x8_t va6x01234567 = vld1_s8(a6); a6 += 4;
+      const int8x8_t va7x01234567 = vld1_s8(a7); a7 += 4;
 
       // Load a 4x16 block of weights.
-      const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb0123x89AB = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb0123xCDEF = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
+      const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+      const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+      const int8x16_t vb0123x89AB = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+      const int8x16_t vb0123xCDEF = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
 
       // Multiply-accumulate: 8x4 * 4x16 --> 8x16.
       vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb0123x0123, va0x01234567, 0);
@@ -260,62 +261,9 @@
       vacc7x4567 = vdotq_lane_s32(vacc7x4567, vb0123x4567, va7x01234567, 0);
       vacc7x89AB = vdotq_lane_s32(vacc7x89AB, vb0123x89AB, va7x01234567, 0);
       vacc7xCDEF = vdotq_lane_s32(vacc7xCDEF, vb0123xCDEF, va7x01234567, 0);
-
-      if (k > 4) {
-        // Load a 4x16 block of weights.
-        const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb4567x89AB = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb4567xCDEF = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-
-        // Multiply-accumulate: 8x4 * 4x16 --> 8x16.
-        vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
-        vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
-        vacc0x89AB = vdotq_lane_s32(vacc0x89AB, vb4567x89AB, va0x01234567, 1);
-        vacc0xCDEF = vdotq_lane_s32(vacc0xCDEF, vb4567xCDEF, va0x01234567, 1);
-        vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb4567x0123, va1x01234567, 1);
-        vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb4567x4567, va1x01234567, 1);
-        vacc1x89AB = vdotq_lane_s32(vacc1x89AB, vb4567x89AB, va1x01234567, 1);
-        vacc1xCDEF = vdotq_lane_s32(vacc1xCDEF, vb4567xCDEF, va1x01234567, 1);
-        vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb4567x0123, va2x01234567, 1);
-        vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb4567x4567, va2x01234567, 1);
-        vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb4567x89AB, va2x01234567, 1);
-        vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb4567xCDEF, va2x01234567, 1);
-        vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb4567x0123, va3x01234567, 1);
-        vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb4567x4567, va3x01234567, 1);
-        vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb4567x89AB, va3x01234567, 1);
-        vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb4567xCDEF, va3x01234567, 1);
-        vacc4x0123 = vdotq_lane_s32(vacc4x0123, vb4567x0123, va4x01234567, 1);
-        vacc4x4567 = vdotq_lane_s32(vacc4x4567, vb4567x4567, va4x01234567, 1);
-        vacc4x89AB = vdotq_lane_s32(vacc4x89AB, vb4567x89AB, va4x01234567, 1);
-        vacc4xCDEF = vdotq_lane_s32(vacc4xCDEF, vb4567xCDEF, va4x01234567, 1);
-        vacc5x0123 = vdotq_lane_s32(vacc5x0123, vb4567x0123, va5x01234567, 1);
-        vacc5x4567 = vdotq_lane_s32(vacc5x4567, vb4567x4567, va5x01234567, 1);
-        vacc5x89AB = vdotq_lane_s32(vacc5x89AB, vb4567x89AB, va5x01234567, 1);
-        vacc5xCDEF = vdotq_lane_s32(vacc5xCDEF, vb4567xCDEF, va5x01234567, 1);
-        vacc6x0123 = vdotq_lane_s32(vacc6x0123, vb4567x0123, va6x01234567, 1);
-        vacc6x4567 = vdotq_lane_s32(vacc6x4567, vb4567x4567, va6x01234567, 1);
-        vacc6x89AB = vdotq_lane_s32(vacc6x89AB, vb4567x89AB, va6x01234567, 1);
-        vacc6xCDEF = vdotq_lane_s32(vacc6xCDEF, vb4567xCDEF, va6x01234567, 1);
-        vacc7x0123 = vdotq_lane_s32(vacc7x0123, vb4567x0123, va7x01234567, 1);
-        vacc7x4567 = vdotq_lane_s32(vacc7x4567, vb4567x4567, va7x01234567, 1);
-        vacc7x89AB = vdotq_lane_s32(vacc7x89AB, vb4567x89AB, va7x01234567, 1);
-        vacc7xCDEF = vdotq_lane_s32(vacc7xCDEF, vb4567xCDEF, va7x01234567, 1);
-      }
     }
-    // End of accumulation loop. The variable `kc` contains the amount by which
-    // we advanced the `va` pointers, so we rewind by this amount now.
-    a0 = (const int8_t*)((uintptr_t)a0 - kc);
-    a1 = (const int8_t*)((uintptr_t)a1 - kc);
-    a2 = (const int8_t*)((uintptr_t)a2 - kc);
-    a3 = (const int8_t*)((uintptr_t)a3 - kc);
-    a4 = (const int8_t*)((uintptr_t)a4 - kc);
-    a5 = (const int8_t*)((uintptr_t)a5 - kc);
-    a6 = (const int8_t*)((uintptr_t)a6 - kc);
-    a7 = (const int8_t*)((uintptr_t)a7 - kc);
 
     // Post-accumulation work
-
     const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
     const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
 
@@ -514,6 +462,15 @@
       c6 = (int8_t*) ((uintptr_t) c6 + cn_stride);
       c7 = (int8_t*) ((uintptr_t) c7 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+      a4 = (const int8_t*) ((uintptr_t) a4 - kc);
+      a5 = (const int8_t*) ((uintptr_t) a5 - kc);
+      a6 = (const int8_t*) ((uintptr_t) a6 - kc);
+      a7 = (const int8_t*) ((uintptr_t) a7 - kc);
+
       nc -= 16;
     } else {
       // Final case where not all of the 16 columns fit in the destination.
diff --git a/src/qs8-gemm/gen/8x8c4-minmax-neondot.c b/src/qs8-gemm/gen/8x8c4-minmax-neondot.c
index 4c84498..204f3be 100644
--- a/src/qs8-gemm/gen/8x8c4-minmax-neondot.c
+++ b/src/qs8-gemm/gen/8x8c4-minmax-neondot.c
@@ -7,18 +7,14 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
-
 #include <assert.h>
 
 #include <arm_neon.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
-// This kernel uses ARMv8.2 dot-product instructions.
-//
-// Scalar model: xnn_qs8_gemm_minmax_ukernel_8x8c4__scalar. Refer to
-// that kernel for more comments.
 void xnn_qs8_gemm_minmax_ukernel_8x8c4__neondot(
     size_t mr,
     size_t nc,
@@ -34,7 +30,12 @@
   assert(mr <= 8);
   assert(nc != 0);
   assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
 
+  kc = round_up_po2(kc, 4);
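+  // kc rounded up to a multiple of 4: each vdotq_lane_s32 consumes 4 int8 values per lane, so the remainder path covers at most 4 positions of k.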
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -116,10 +117,10 @@
       const int8x8_t va7x01234567 = vld1_s8(a7); a7 += 8;
 
       // Load a 8x8 block of weights.
-      const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
+      const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+      const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+      const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+      const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
 
       // Multiply-accumulate: 8x8 * 8x8 --> 8x8.
       vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb0123x0123, va0x01234567, 0);
@@ -157,21 +158,21 @@
 
       k -= 8 * sizeof(int8_t);
     }
-    // Handle up to 7 final positions of `k`
+    // Handle up to 4 final positions of `k`
     if XNN_UNLIKELY(k != 0) {
       // Load a 8x4 block of activations.
-      const int8x8_t va0x01234567 = vld1_s8(a0); a0 += k;
-      const int8x8_t va1x01234567 = vld1_s8(a1); a1 += k;
-      const int8x8_t va2x01234567 = vld1_s8(a2); a2 += k;
-      const int8x8_t va3x01234567 = vld1_s8(a3); a3 += k;
-      const int8x8_t va4x01234567 = vld1_s8(a4); a4 += k;
-      const int8x8_t va5x01234567 = vld1_s8(a5); a5 += k;
-      const int8x8_t va6x01234567 = vld1_s8(a6); a6 += k;
-      const int8x8_t va7x01234567 = vld1_s8(a7); a7 += k;
+      const int8x8_t va0x01234567 = vld1_s8(a0); a0 += 4;
+      const int8x8_t va1x01234567 = vld1_s8(a1); a1 += 4;
+      const int8x8_t va2x01234567 = vld1_s8(a2); a2 += 4;
+      const int8x8_t va3x01234567 = vld1_s8(a3); a3 += 4;
+      const int8x8_t va4x01234567 = vld1_s8(a4); a4 += 4;
+      const int8x8_t va5x01234567 = vld1_s8(a5); a5 += 4;
+      const int8x8_t va6x01234567 = vld1_s8(a6); a6 += 4;
+      const int8x8_t va7x01234567 = vld1_s8(a7); a7 += 4;
 
       // Load a 4x8 block of weights.
-      const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-      const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
+      const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+      const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
 
       // Multiply-accumulate: 8x4 * 4x8 --> 8x8.
       vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb0123x0123, va0x01234567, 0);
@@ -190,44 +191,9 @@
       vacc6x4567 = vdotq_lane_s32(vacc6x4567, vb0123x4567, va6x01234567, 0);
       vacc7x0123 = vdotq_lane_s32(vacc7x0123, vb0123x0123, va7x01234567, 0);
       vacc7x4567 = vdotq_lane_s32(vacc7x4567, vb0123x4567, va7x01234567, 0);
-
-      if (k > 4) {
-        // Load a 4x8 block of weights.
-        const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-
-        // Multiply-accumulate: 8x4 * 4x8 --> 8x8.
-        vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
-        vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
-        vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb4567x0123, va1x01234567, 1);
-        vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb4567x4567, va1x01234567, 1);
-        vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb4567x0123, va2x01234567, 1);
-        vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb4567x4567, va2x01234567, 1);
-        vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb4567x0123, va3x01234567, 1);
-        vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb4567x4567, va3x01234567, 1);
-        vacc4x0123 = vdotq_lane_s32(vacc4x0123, vb4567x0123, va4x01234567, 1);
-        vacc4x4567 = vdotq_lane_s32(vacc4x4567, vb4567x4567, va4x01234567, 1);
-        vacc5x0123 = vdotq_lane_s32(vacc5x0123, vb4567x0123, va5x01234567, 1);
-        vacc5x4567 = vdotq_lane_s32(vacc5x4567, vb4567x4567, va5x01234567, 1);
-        vacc6x0123 = vdotq_lane_s32(vacc6x0123, vb4567x0123, va6x01234567, 1);
-        vacc6x4567 = vdotq_lane_s32(vacc6x4567, vb4567x4567, va6x01234567, 1);
-        vacc7x0123 = vdotq_lane_s32(vacc7x0123, vb4567x0123, va7x01234567, 1);
-        vacc7x4567 = vdotq_lane_s32(vacc7x4567, vb4567x4567, va7x01234567, 1);
-      }
     }
-    // End of accumulation loop. The variable `kc` contains the amount by which
-    // we advanced the `va` pointers, so we rewind by this amount now.
-    a0 = (const int8_t*)((uintptr_t)a0 - kc);
-    a1 = (const int8_t*)((uintptr_t)a1 - kc);
-    a2 = (const int8_t*)((uintptr_t)a2 - kc);
-    a3 = (const int8_t*)((uintptr_t)a3 - kc);
-    a4 = (const int8_t*)((uintptr_t)a4 - kc);
-    a5 = (const int8_t*)((uintptr_t)a5 - kc);
-    a6 = (const int8_t*)((uintptr_t)a6 - kc);
-    a7 = (const int8_t*)((uintptr_t)a7 - kc);
 
     // Post-accumulation work
-
     const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
     const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
 
@@ -346,6 +312,15 @@
       c6 = (int8_t*) ((uintptr_t) c6 + cn_stride);
       c7 = (int8_t*) ((uintptr_t) c7 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+      a4 = (const int8_t*) ((uintptr_t) a4 - kc);
+      a5 = (const int8_t*) ((uintptr_t) a5 - kc);
+      a6 = (const int8_t*) ((uintptr_t) a6 - kc);
+      a7 = (const int8_t*) ((uintptr_t) a7 - kc);
+
       nc -= 8;
     } else {
       // Final case where not all of the 8 columns fit in the destination.
diff --git a/src/qs8-gemm/gen/8x8c4-minmax-scalar.c b/src/qs8-gemm/gen/8x8c4-minmax-scalar.c
deleted file mode 100644
index a07ea26..0000000
--- a/src/qs8-gemm/gen/8x8c4-minmax-scalar.c
+++ /dev/null
@@ -1,1157 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/qs8-gemm/MRxNRc4-scalar.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <xnnpack/gemm.h>
-
-#include <xnnpack/scalar-utils.h>
-
-// This kernel is a scalar model for a kernel using ARMv8.2 dot-product
-// instructions.
-//
-// XNN_DISABLE_TSAN is used because this kernel reads up to 3 bytes past the
-// bounds of the `a` matrix region, which may be a race condition with
-// another thread. We deem this acceptable because the values that are
-// read out of bounds do not affect the result, and the compiler can't know
-// about this undefined behavior.
-void xnn_qs8_gemm_minmax_ukernel_8x8c4__scalar(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    const int8_t* restrict a,
-    size_t a_stride,
-    const void* restrict w,
-    int8_t* restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN {
-  assert(mr != 0);
-  assert(mr <= 8);
-  assert(nc != 0);
-  assert(kc != 0);
-
-  const int8_t* a0 = a;
-  int8_t* c0 = c;
-  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
-  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 2) {
-    a1 = a0;
-    c1 = c0;
-  }
-  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
-  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 2) {
-    a2 = a1;
-    c2 = c1;
-  }
-  const int8_t* a3 = (const int8_t*) ((uintptr_t) a2 + a_stride);
-  int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 4) {
-    a3 = a2;
-    c3 = c2;
-  }
-  const int8_t* a4 = (const int8_t*) ((uintptr_t) a3 + a_stride);
-  int8_t* c4 = (int8_t*) ((uintptr_t) c3 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 4) {
-    a4 = a3;
-    c4 = c3;
-  }
-  const int8_t* a5 = (const int8_t*) ((uintptr_t) a4 + a_stride);
-  int8_t* c5 = (int8_t*) ((uintptr_t) c4 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 6) {
-    a5 = a4;
-    c5 = c4;
-  }
-  const int8_t* a6 = (const int8_t*) ((uintptr_t) a5 + a_stride);
-  int8_t* c6 = (int8_t*) ((uintptr_t) c5 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 6) {
-    a6 = a5;
-    c6 = c5;
-  }
-  const int8_t* a7 = (const int8_t*) ((uintptr_t) a6 + a_stride);
-  int8_t* c7 = (int8_t*) ((uintptr_t) c6 + cm_stride);
-  if XNN_UNPREDICTABLE(mr != 8) {
-    a7 = a6;
-    c7 = c6;
-  }
-
-  // Loop over groups of 8 columns.
-  do {
-    // `vaccMN` is the accumulator at row `M`, column `N`.
-    // Initialize accumulators with bias. 8 bias values are loaded from the
-    // weight matrix, at the start of the group of 8 columns.
-    int32_t bias0 = ((const int32_t*)w)[0];
-    int32_t vacc00 = bias0;
-    int32_t vacc10 = bias0;
-    int32_t vacc20 = bias0;
-    int32_t vacc30 = bias0;
-    int32_t vacc40 = bias0;
-    int32_t vacc50 = bias0;
-    int32_t vacc60 = bias0;
-    int32_t vacc70 = bias0;
-    int32_t bias1 = ((const int32_t*)w)[1];
-    int32_t vacc01 = bias1;
-    int32_t vacc11 = bias1;
-    int32_t vacc21 = bias1;
-    int32_t vacc31 = bias1;
-    int32_t vacc41 = bias1;
-    int32_t vacc51 = bias1;
-    int32_t vacc61 = bias1;
-    int32_t vacc71 = bias1;
-    int32_t bias2 = ((const int32_t*)w)[2];
-    int32_t vacc02 = bias2;
-    int32_t vacc12 = bias2;
-    int32_t vacc22 = bias2;
-    int32_t vacc32 = bias2;
-    int32_t vacc42 = bias2;
-    int32_t vacc52 = bias2;
-    int32_t vacc62 = bias2;
-    int32_t vacc72 = bias2;
-    int32_t bias3 = ((const int32_t*)w)[3];
-    int32_t vacc03 = bias3;
-    int32_t vacc13 = bias3;
-    int32_t vacc23 = bias3;
-    int32_t vacc33 = bias3;
-    int32_t vacc43 = bias3;
-    int32_t vacc53 = bias3;
-    int32_t vacc63 = bias3;
-    int32_t vacc73 = bias3;
-    int32_t bias4 = ((const int32_t*)w)[4];
-    int32_t vacc04 = bias4;
-    int32_t vacc14 = bias4;
-    int32_t vacc24 = bias4;
-    int32_t vacc34 = bias4;
-    int32_t vacc44 = bias4;
-    int32_t vacc54 = bias4;
-    int32_t vacc64 = bias4;
-    int32_t vacc74 = bias4;
-    int32_t bias5 = ((const int32_t*)w)[5];
-    int32_t vacc05 = bias5;
-    int32_t vacc15 = bias5;
-    int32_t vacc25 = bias5;
-    int32_t vacc35 = bias5;
-    int32_t vacc45 = bias5;
-    int32_t vacc55 = bias5;
-    int32_t vacc65 = bias5;
-    int32_t vacc75 = bias5;
-    int32_t bias6 = ((const int32_t*)w)[6];
-    int32_t vacc06 = bias6;
-    int32_t vacc16 = bias6;
-    int32_t vacc26 = bias6;
-    int32_t vacc36 = bias6;
-    int32_t vacc46 = bias6;
-    int32_t vacc56 = bias6;
-    int32_t vacc66 = bias6;
-    int32_t vacc76 = bias6;
-    int32_t bias7 = ((const int32_t*)w)[7];
-    int32_t vacc07 = bias7;
-    int32_t vacc17 = bias7;
-    int32_t vacc27 = bias7;
-    int32_t vacc37 = bias7;
-    int32_t vacc47 = bias7;
-    int32_t vacc57 = bias7;
-    int32_t vacc67 = bias7;
-    int32_t vacc77 = bias7;
-
-    w = (const void*)((uintptr_t)w + 8 * sizeof(int32_t));
-
-    // Inner accumulation loop along the 8 columns.
-    // Handle 4 rows at each iteration: this is key to modelling what an
-    // actual kernel using ARMv8.2 dot-product instructions would look like.
-    size_t k = 0;
-    while (k < kc) {
-      // Load a 8x4 block of activations.
-      int32_t va00 = *a0++;
-      int32_t va01 = *a0++;
-      int32_t va02 = *a0++;
-      int32_t va03 = *a0++;
-      int32_t va10 = *a1++;
-      int32_t va11 = *a1++;
-      int32_t va12 = *a1++;
-      int32_t va13 = *a1++;
-      int32_t va20 = *a2++;
-      int32_t va21 = *a2++;
-      int32_t va22 = *a2++;
-      int32_t va23 = *a2++;
-      int32_t va30 = *a3++;
-      int32_t va31 = *a3++;
-      int32_t va32 = *a3++;
-      int32_t va33 = *a3++;
-      int32_t va40 = *a4++;
-      int32_t va41 = *a4++;
-      int32_t va42 = *a4++;
-      int32_t va43 = *a4++;
-      int32_t va50 = *a5++;
-      int32_t va51 = *a5++;
-      int32_t va52 = *a5++;
-      int32_t va53 = *a5++;
-      int32_t va60 = *a6++;
-      int32_t va61 = *a6++;
-      int32_t va62 = *a6++;
-      int32_t va63 = *a6++;
-      int32_t va70 = *a7++;
-      int32_t va71 = *a7++;
-      int32_t va72 = *a7++;
-      int32_t va73 = *a7++;
-
-      // Load a 4x8 block of weights.
-      int32_t vb00 = ((const int8_t*)w)[0];
-      int32_t vb10 = ((const int8_t*)w)[1];
-      int32_t vb20 = ((const int8_t*)w)[2];
-      int32_t vb30 = ((const int8_t*)w)[3];
-
-      w = (const void*)((uintptr_t)w + 4 * sizeof(int8_t));
-      int32_t vb01 = ((const int8_t*)w)[0];
-      int32_t vb11 = ((const int8_t*)w)[1];
-      int32_t vb21 = ((const int8_t*)w)[2];
-      int32_t vb31 = ((const int8_t*)w)[3];
-
-      w = (const void*)((uintptr_t)w + 4 * sizeof(int8_t));
-      int32_t vb02 = ((const int8_t*)w)[0];
-      int32_t vb12 = ((const int8_t*)w)[1];
-      int32_t vb22 = ((const int8_t*)w)[2];
-      int32_t vb32 = ((const int8_t*)w)[3];
-
-      w = (const void*)((uintptr_t)w + 4 * sizeof(int8_t));
-      int32_t vb03 = ((const int8_t*)w)[0];
-      int32_t vb13 = ((const int8_t*)w)[1];
-      int32_t vb23 = ((const int8_t*)w)[2];
-      int32_t vb33 = ((const int8_t*)w)[3];
-
-      w = (const void*)((uintptr_t)w + 4 * sizeof(int8_t));
-      int32_t vb04 = ((const int8_t*)w)[0];
-      int32_t vb14 = ((const int8_t*)w)[1];
-      int32_t vb24 = ((const int8_t*)w)[2];
-      int32_t vb34 = ((const int8_t*)w)[3];
-
-      w = (const void*)((uintptr_t)w + 4 * sizeof(int8_t));
-      int32_t vb05 = ((const int8_t*)w)[0];
-      int32_t vb15 = ((const int8_t*)w)[1];
-      int32_t vb25 = ((const int8_t*)w)[2];
-      int32_t vb35 = ((const int8_t*)w)[3];
-
-      w = (const void*)((uintptr_t)w + 4 * sizeof(int8_t));
-      int32_t vb06 = ((const int8_t*)w)[0];
-      int32_t vb16 = ((const int8_t*)w)[1];
-      int32_t vb26 = ((const int8_t*)w)[2];
-      int32_t vb36 = ((const int8_t*)w)[3];
-
-      w = (const void*)((uintptr_t)w + 4 * sizeof(int8_t));
-      int32_t vb07 = ((const int8_t*)w)[0];
-      int32_t vb17 = ((const int8_t*)w)[1];
-      int32_t vb27 = ((const int8_t*)w)[2];
-      int32_t vb37 = ((const int8_t*)w)[3];
-
-      w = (const void*)((uintptr_t)w + 4 * sizeof(int8_t));
-
-      // Multiply-accumulate: 8x4 * 4x8 --> 8x8. The inner size 4 here means
-      // we're computing 4D dot-products, which makes this a model for
-      // an ARMv8.2 dot-product kernel.
-      vacc00 += va00 * vb00;
-      vacc00 += va01 * vb10;
-      vacc00 += va02 * vb20;
-      vacc00 += va03 * vb30;
-      vacc01 += va00 * vb01;
-      vacc01 += va01 * vb11;
-      vacc01 += va02 * vb21;
-      vacc01 += va03 * vb31;
-      vacc02 += va00 * vb02;
-      vacc02 += va01 * vb12;
-      vacc02 += va02 * vb22;
-      vacc02 += va03 * vb32;
-      vacc03 += va00 * vb03;
-      vacc03 += va01 * vb13;
-      vacc03 += va02 * vb23;
-      vacc03 += va03 * vb33;
-      vacc04 += va00 * vb04;
-      vacc04 += va01 * vb14;
-      vacc04 += va02 * vb24;
-      vacc04 += va03 * vb34;
-      vacc05 += va00 * vb05;
-      vacc05 += va01 * vb15;
-      vacc05 += va02 * vb25;
-      vacc05 += va03 * vb35;
-      vacc06 += va00 * vb06;
-      vacc06 += va01 * vb16;
-      vacc06 += va02 * vb26;
-      vacc06 += va03 * vb36;
-      vacc07 += va00 * vb07;
-      vacc07 += va01 * vb17;
-      vacc07 += va02 * vb27;
-      vacc07 += va03 * vb37;
-      vacc10 += va10 * vb00;
-      vacc10 += va11 * vb10;
-      vacc10 += va12 * vb20;
-      vacc10 += va13 * vb30;
-      vacc11 += va10 * vb01;
-      vacc11 += va11 * vb11;
-      vacc11 += va12 * vb21;
-      vacc11 += va13 * vb31;
-      vacc12 += va10 * vb02;
-      vacc12 += va11 * vb12;
-      vacc12 += va12 * vb22;
-      vacc12 += va13 * vb32;
-      vacc13 += va10 * vb03;
-      vacc13 += va11 * vb13;
-      vacc13 += va12 * vb23;
-      vacc13 += va13 * vb33;
-      vacc14 += va10 * vb04;
-      vacc14 += va11 * vb14;
-      vacc14 += va12 * vb24;
-      vacc14 += va13 * vb34;
-      vacc15 += va10 * vb05;
-      vacc15 += va11 * vb15;
-      vacc15 += va12 * vb25;
-      vacc15 += va13 * vb35;
-      vacc16 += va10 * vb06;
-      vacc16 += va11 * vb16;
-      vacc16 += va12 * vb26;
-      vacc16 += va13 * vb36;
-      vacc17 += va10 * vb07;
-      vacc17 += va11 * vb17;
-      vacc17 += va12 * vb27;
-      vacc17 += va13 * vb37;
-      vacc20 += va20 * vb00;
-      vacc20 += va21 * vb10;
-      vacc20 += va22 * vb20;
-      vacc20 += va23 * vb30;
-      vacc21 += va20 * vb01;
-      vacc21 += va21 * vb11;
-      vacc21 += va22 * vb21;
-      vacc21 += va23 * vb31;
-      vacc22 += va20 * vb02;
-      vacc22 += va21 * vb12;
-      vacc22 += va22 * vb22;
-      vacc22 += va23 * vb32;
-      vacc23 += va20 * vb03;
-      vacc23 += va21 * vb13;
-      vacc23 += va22 * vb23;
-      vacc23 += va23 * vb33;
-      vacc24 += va20 * vb04;
-      vacc24 += va21 * vb14;
-      vacc24 += va22 * vb24;
-      vacc24 += va23 * vb34;
-      vacc25 += va20 * vb05;
-      vacc25 += va21 * vb15;
-      vacc25 += va22 * vb25;
-      vacc25 += va23 * vb35;
-      vacc26 += va20 * vb06;
-      vacc26 += va21 * vb16;
-      vacc26 += va22 * vb26;
-      vacc26 += va23 * vb36;
-      vacc27 += va20 * vb07;
-      vacc27 += va21 * vb17;
-      vacc27 += va22 * vb27;
-      vacc27 += va23 * vb37;
-      vacc30 += va30 * vb00;
-      vacc30 += va31 * vb10;
-      vacc30 += va32 * vb20;
-      vacc30 += va33 * vb30;
-      vacc31 += va30 * vb01;
-      vacc31 += va31 * vb11;
-      vacc31 += va32 * vb21;
-      vacc31 += va33 * vb31;
-      vacc32 += va30 * vb02;
-      vacc32 += va31 * vb12;
-      vacc32 += va32 * vb22;
-      vacc32 += va33 * vb32;
-      vacc33 += va30 * vb03;
-      vacc33 += va31 * vb13;
-      vacc33 += va32 * vb23;
-      vacc33 += va33 * vb33;
-      vacc34 += va30 * vb04;
-      vacc34 += va31 * vb14;
-      vacc34 += va32 * vb24;
-      vacc34 += va33 * vb34;
-      vacc35 += va30 * vb05;
-      vacc35 += va31 * vb15;
-      vacc35 += va32 * vb25;
-      vacc35 += va33 * vb35;
-      vacc36 += va30 * vb06;
-      vacc36 += va31 * vb16;
-      vacc36 += va32 * vb26;
-      vacc36 += va33 * vb36;
-      vacc37 += va30 * vb07;
-      vacc37 += va31 * vb17;
-      vacc37 += va32 * vb27;
-      vacc37 += va33 * vb37;
-      vacc40 += va40 * vb00;
-      vacc40 += va41 * vb10;
-      vacc40 += va42 * vb20;
-      vacc40 += va43 * vb30;
-      vacc41 += va40 * vb01;
-      vacc41 += va41 * vb11;
-      vacc41 += va42 * vb21;
-      vacc41 += va43 * vb31;
-      vacc42 += va40 * vb02;
-      vacc42 += va41 * vb12;
-      vacc42 += va42 * vb22;
-      vacc42 += va43 * vb32;
-      vacc43 += va40 * vb03;
-      vacc43 += va41 * vb13;
-      vacc43 += va42 * vb23;
-      vacc43 += va43 * vb33;
-      vacc44 += va40 * vb04;
-      vacc44 += va41 * vb14;
-      vacc44 += va42 * vb24;
-      vacc44 += va43 * vb34;
-      vacc45 += va40 * vb05;
-      vacc45 += va41 * vb15;
-      vacc45 += va42 * vb25;
-      vacc45 += va43 * vb35;
-      vacc46 += va40 * vb06;
-      vacc46 += va41 * vb16;
-      vacc46 += va42 * vb26;
-      vacc46 += va43 * vb36;
-      vacc47 += va40 * vb07;
-      vacc47 += va41 * vb17;
-      vacc47 += va42 * vb27;
-      vacc47 += va43 * vb37;
-      vacc50 += va50 * vb00;
-      vacc50 += va51 * vb10;
-      vacc50 += va52 * vb20;
-      vacc50 += va53 * vb30;
-      vacc51 += va50 * vb01;
-      vacc51 += va51 * vb11;
-      vacc51 += va52 * vb21;
-      vacc51 += va53 * vb31;
-      vacc52 += va50 * vb02;
-      vacc52 += va51 * vb12;
-      vacc52 += va52 * vb22;
-      vacc52 += va53 * vb32;
-      vacc53 += va50 * vb03;
-      vacc53 += va51 * vb13;
-      vacc53 += va52 * vb23;
-      vacc53 += va53 * vb33;
-      vacc54 += va50 * vb04;
-      vacc54 += va51 * vb14;
-      vacc54 += va52 * vb24;
-      vacc54 += va53 * vb34;
-      vacc55 += va50 * vb05;
-      vacc55 += va51 * vb15;
-      vacc55 += va52 * vb25;
-      vacc55 += va53 * vb35;
-      vacc56 += va50 * vb06;
-      vacc56 += va51 * vb16;
-      vacc56 += va52 * vb26;
-      vacc56 += va53 * vb36;
-      vacc57 += va50 * vb07;
-      vacc57 += va51 * vb17;
-      vacc57 += va52 * vb27;
-      vacc57 += va53 * vb37;
-      vacc60 += va60 * vb00;
-      vacc60 += va61 * vb10;
-      vacc60 += va62 * vb20;
-      vacc60 += va63 * vb30;
-      vacc61 += va60 * vb01;
-      vacc61 += va61 * vb11;
-      vacc61 += va62 * vb21;
-      vacc61 += va63 * vb31;
-      vacc62 += va60 * vb02;
-      vacc62 += va61 * vb12;
-      vacc62 += va62 * vb22;
-      vacc62 += va63 * vb32;
-      vacc63 += va60 * vb03;
-      vacc63 += va61 * vb13;
-      vacc63 += va62 * vb23;
-      vacc63 += va63 * vb33;
-      vacc64 += va60 * vb04;
-      vacc64 += va61 * vb14;
-      vacc64 += va62 * vb24;
-      vacc64 += va63 * vb34;
-      vacc65 += va60 * vb05;
-      vacc65 += va61 * vb15;
-      vacc65 += va62 * vb25;
-      vacc65 += va63 * vb35;
-      vacc66 += va60 * vb06;
-      vacc66 += va61 * vb16;
-      vacc66 += va62 * vb26;
-      vacc66 += va63 * vb36;
-      vacc67 += va60 * vb07;
-      vacc67 += va61 * vb17;
-      vacc67 += va62 * vb27;
-      vacc67 += va63 * vb37;
-      vacc70 += va70 * vb00;
-      vacc70 += va71 * vb10;
-      vacc70 += va72 * vb20;
-      vacc70 += va73 * vb30;
-      vacc71 += va70 * vb01;
-      vacc71 += va71 * vb11;
-      vacc71 += va72 * vb21;
-      vacc71 += va73 * vb31;
-      vacc72 += va70 * vb02;
-      vacc72 += va71 * vb12;
-      vacc72 += va72 * vb22;
-      vacc72 += va73 * vb32;
-      vacc73 += va70 * vb03;
-      vacc73 += va71 * vb13;
-      vacc73 += va72 * vb23;
-      vacc73 += va73 * vb33;
-      vacc74 += va70 * vb04;
-      vacc74 += va71 * vb14;
-      vacc74 += va72 * vb24;
-      vacc74 += va73 * vb34;
-      vacc75 += va70 * vb05;
-      vacc75 += va71 * vb15;
-      vacc75 += va72 * vb25;
-      vacc75 += va73 * vb35;
-      vacc76 += va70 * vb06;
-      vacc76 += va71 * vb16;
-      vacc76 += va72 * vb26;
-      vacc76 += va73 * vb36;
-      vacc77 += va70 * vb07;
-      vacc77 += va71 * vb17;
-      vacc77 += va72 * vb27;
-      vacc77 += va73 * vb37;
-
-      k += 4 * sizeof(int8_t);
-    }
-    // End of accumulation loop. The variable `k` contains the amount by which
-    // we advanced the `va` pointers, so we rewind by this amount now.
-    a0 = (const int8_t*)((uintptr_t)a0 - k);
-    a1 = (const int8_t*)((uintptr_t)a1 - k);
-    a2 = (const int8_t*)((uintptr_t)a2 - k);
-    a3 = (const int8_t*)((uintptr_t)a3 - k);
-    a4 = (const int8_t*)((uintptr_t)a4 - k);
-    a5 = (const int8_t*)((uintptr_t)a5 - k);
-    a6 = (const int8_t*)((uintptr_t)a6 - k);
-    a7 = (const int8_t*)((uintptr_t)a7 - k);
-
-    // Post-accumulation work
-
-    const int32_t vmultiplier = params->scalar.multiplier;
-    const int64_t vq31rounding = INT64_C(0x40000000);
-    const int32_t vremainder_mask = params->scalar.remainder_mask;
-    const uint32_t vshift = params->scalar.shift;
-    const int32_t vremainder_threshold = params->scalar.remainder_threshold;
-    const int32_t voutput_min = params->scalar.output_min_less_zero_point;
-    const int32_t voutput_max = params->scalar.output_max_less_zero_point;
-    const int32_t voutput_zero_point = params->scalar.output_zero_point;
-
-    const int64_t vproduct00 = (int64_t)vacc00 * (int64_t)vmultiplier;
-    const int64_t vproduct01 = (int64_t)vacc01 * (int64_t)vmultiplier;
-    const int64_t vproduct02 = (int64_t)vacc02 * (int64_t)vmultiplier;
-    const int64_t vproduct03 = (int64_t)vacc03 * (int64_t)vmultiplier;
-    const int64_t vproduct04 = (int64_t)vacc04 * (int64_t)vmultiplier;
-    const int64_t vproduct05 = (int64_t)vacc05 * (int64_t)vmultiplier;
-    const int64_t vproduct06 = (int64_t)vacc06 * (int64_t)vmultiplier;
-    const int64_t vproduct07 = (int64_t)vacc07 * (int64_t)vmultiplier;
-    const int64_t vproduct10 = (int64_t)vacc10 * (int64_t)vmultiplier;
-    const int64_t vproduct11 = (int64_t)vacc11 * (int64_t)vmultiplier;
-    const int64_t vproduct12 = (int64_t)vacc12 * (int64_t)vmultiplier;
-    const int64_t vproduct13 = (int64_t)vacc13 * (int64_t)vmultiplier;
-    const int64_t vproduct14 = (int64_t)vacc14 * (int64_t)vmultiplier;
-    const int64_t vproduct15 = (int64_t)vacc15 * (int64_t)vmultiplier;
-    const int64_t vproduct16 = (int64_t)vacc16 * (int64_t)vmultiplier;
-    const int64_t vproduct17 = (int64_t)vacc17 * (int64_t)vmultiplier;
-    const int64_t vproduct20 = (int64_t)vacc20 * (int64_t)vmultiplier;
-    const int64_t vproduct21 = (int64_t)vacc21 * (int64_t)vmultiplier;
-    const int64_t vproduct22 = (int64_t)vacc22 * (int64_t)vmultiplier;
-    const int64_t vproduct23 = (int64_t)vacc23 * (int64_t)vmultiplier;
-    const int64_t vproduct24 = (int64_t)vacc24 * (int64_t)vmultiplier;
-    const int64_t vproduct25 = (int64_t)vacc25 * (int64_t)vmultiplier;
-    const int64_t vproduct26 = (int64_t)vacc26 * (int64_t)vmultiplier;
-    const int64_t vproduct27 = (int64_t)vacc27 * (int64_t)vmultiplier;
-    const int64_t vproduct30 = (int64_t)vacc30 * (int64_t)vmultiplier;
-    const int64_t vproduct31 = (int64_t)vacc31 * (int64_t)vmultiplier;
-    const int64_t vproduct32 = (int64_t)vacc32 * (int64_t)vmultiplier;
-    const int64_t vproduct33 = (int64_t)vacc33 * (int64_t)vmultiplier;
-    const int64_t vproduct34 = (int64_t)vacc34 * (int64_t)vmultiplier;
-    const int64_t vproduct35 = (int64_t)vacc35 * (int64_t)vmultiplier;
-    const int64_t vproduct36 = (int64_t)vacc36 * (int64_t)vmultiplier;
-    const int64_t vproduct37 = (int64_t)vacc37 * (int64_t)vmultiplier;
-    const int64_t vproduct40 = (int64_t)vacc40 * (int64_t)vmultiplier;
-    const int64_t vproduct41 = (int64_t)vacc41 * (int64_t)vmultiplier;
-    const int64_t vproduct42 = (int64_t)vacc42 * (int64_t)vmultiplier;
-    const int64_t vproduct43 = (int64_t)vacc43 * (int64_t)vmultiplier;
-    const int64_t vproduct44 = (int64_t)vacc44 * (int64_t)vmultiplier;
-    const int64_t vproduct45 = (int64_t)vacc45 * (int64_t)vmultiplier;
-    const int64_t vproduct46 = (int64_t)vacc46 * (int64_t)vmultiplier;
-    const int64_t vproduct47 = (int64_t)vacc47 * (int64_t)vmultiplier;
-    const int64_t vproduct50 = (int64_t)vacc50 * (int64_t)vmultiplier;
-    const int64_t vproduct51 = (int64_t)vacc51 * (int64_t)vmultiplier;
-    const int64_t vproduct52 = (int64_t)vacc52 * (int64_t)vmultiplier;
-    const int64_t vproduct53 = (int64_t)vacc53 * (int64_t)vmultiplier;
-    const int64_t vproduct54 = (int64_t)vacc54 * (int64_t)vmultiplier;
-    const int64_t vproduct55 = (int64_t)vacc55 * (int64_t)vmultiplier;
-    const int64_t vproduct56 = (int64_t)vacc56 * (int64_t)vmultiplier;
-    const int64_t vproduct57 = (int64_t)vacc57 * (int64_t)vmultiplier;
-    const int64_t vproduct60 = (int64_t)vacc60 * (int64_t)vmultiplier;
-    const int64_t vproduct61 = (int64_t)vacc61 * (int64_t)vmultiplier;
-    const int64_t vproduct62 = (int64_t)vacc62 * (int64_t)vmultiplier;
-    const int64_t vproduct63 = (int64_t)vacc63 * (int64_t)vmultiplier;
-    const int64_t vproduct64 = (int64_t)vacc64 * (int64_t)vmultiplier;
-    const int64_t vproduct65 = (int64_t)vacc65 * (int64_t)vmultiplier;
-    const int64_t vproduct66 = (int64_t)vacc66 * (int64_t)vmultiplier;
-    const int64_t vproduct67 = (int64_t)vacc67 * (int64_t)vmultiplier;
-    const int64_t vproduct70 = (int64_t)vacc70 * (int64_t)vmultiplier;
-    const int64_t vproduct71 = (int64_t)vacc71 * (int64_t)vmultiplier;
-    const int64_t vproduct72 = (int64_t)vacc72 * (int64_t)vmultiplier;
-    const int64_t vproduct73 = (int64_t)vacc73 * (int64_t)vmultiplier;
-    const int64_t vproduct74 = (int64_t)vacc74 * (int64_t)vmultiplier;
-    const int64_t vproduct75 = (int64_t)vacc75 * (int64_t)vmultiplier;
-    const int64_t vproduct76 = (int64_t)vacc76 * (int64_t)vmultiplier;
-    const int64_t vproduct77 = (int64_t)vacc77 * (int64_t)vmultiplier;
-
-    const int32_t vq31product00 = (int32_t)(uint32_t)((uint64_t)(vproduct00 + vq31rounding) >> 31);
-    const int32_t vq31product01 = (int32_t)(uint32_t)((uint64_t)(vproduct01 + vq31rounding) >> 31);
-    const int32_t vq31product02 = (int32_t)(uint32_t)((uint64_t)(vproduct02 + vq31rounding) >> 31);
-    const int32_t vq31product03 = (int32_t)(uint32_t)((uint64_t)(vproduct03 + vq31rounding) >> 31);
-    const int32_t vq31product04 = (int32_t)(uint32_t)((uint64_t)(vproduct04 + vq31rounding) >> 31);
-    const int32_t vq31product05 = (int32_t)(uint32_t)((uint64_t)(vproduct05 + vq31rounding) >> 31);
-    const int32_t vq31product06 = (int32_t)(uint32_t)((uint64_t)(vproduct06 + vq31rounding) >> 31);
-    const int32_t vq31product07 = (int32_t)(uint32_t)((uint64_t)(vproduct07 + vq31rounding) >> 31);
-    const int32_t vq31product10 = (int32_t)(uint32_t)((uint64_t)(vproduct10 + vq31rounding) >> 31);
-    const int32_t vq31product11 = (int32_t)(uint32_t)((uint64_t)(vproduct11 + vq31rounding) >> 31);
-    const int32_t vq31product12 = (int32_t)(uint32_t)((uint64_t)(vproduct12 + vq31rounding) >> 31);
-    const int32_t vq31product13 = (int32_t)(uint32_t)((uint64_t)(vproduct13 + vq31rounding) >> 31);
-    const int32_t vq31product14 = (int32_t)(uint32_t)((uint64_t)(vproduct14 + vq31rounding) >> 31);
-    const int32_t vq31product15 = (int32_t)(uint32_t)((uint64_t)(vproduct15 + vq31rounding) >> 31);
-    const int32_t vq31product16 = (int32_t)(uint32_t)((uint64_t)(vproduct16 + vq31rounding) >> 31);
-    const int32_t vq31product17 = (int32_t)(uint32_t)((uint64_t)(vproduct17 + vq31rounding) >> 31);
-    const int32_t vq31product20 = (int32_t)(uint32_t)((uint64_t)(vproduct20 + vq31rounding) >> 31);
-    const int32_t vq31product21 = (int32_t)(uint32_t)((uint64_t)(vproduct21 + vq31rounding) >> 31);
-    const int32_t vq31product22 = (int32_t)(uint32_t)((uint64_t)(vproduct22 + vq31rounding) >> 31);
-    const int32_t vq31product23 = (int32_t)(uint32_t)((uint64_t)(vproduct23 + vq31rounding) >> 31);
-    const int32_t vq31product24 = (int32_t)(uint32_t)((uint64_t)(vproduct24 + vq31rounding) >> 31);
-    const int32_t vq31product25 = (int32_t)(uint32_t)((uint64_t)(vproduct25 + vq31rounding) >> 31);
-    const int32_t vq31product26 = (int32_t)(uint32_t)((uint64_t)(vproduct26 + vq31rounding) >> 31);
-    const int32_t vq31product27 = (int32_t)(uint32_t)((uint64_t)(vproduct27 + vq31rounding) >> 31);
-    const int32_t vq31product30 = (int32_t)(uint32_t)((uint64_t)(vproduct30 + vq31rounding) >> 31);
-    const int32_t vq31product31 = (int32_t)(uint32_t)((uint64_t)(vproduct31 + vq31rounding) >> 31);
-    const int32_t vq31product32 = (int32_t)(uint32_t)((uint64_t)(vproduct32 + vq31rounding) >> 31);
-    const int32_t vq31product33 = (int32_t)(uint32_t)((uint64_t)(vproduct33 + vq31rounding) >> 31);
-    const int32_t vq31product34 = (int32_t)(uint32_t)((uint64_t)(vproduct34 + vq31rounding) >> 31);
-    const int32_t vq31product35 = (int32_t)(uint32_t)((uint64_t)(vproduct35 + vq31rounding) >> 31);
-    const int32_t vq31product36 = (int32_t)(uint32_t)((uint64_t)(vproduct36 + vq31rounding) >> 31);
-    const int32_t vq31product37 = (int32_t)(uint32_t)((uint64_t)(vproduct37 + vq31rounding) >> 31);
-    const int32_t vq31product40 = (int32_t)(uint32_t)((uint64_t)(vproduct40 + vq31rounding) >> 31);
-    const int32_t vq31product41 = (int32_t)(uint32_t)((uint64_t)(vproduct41 + vq31rounding) >> 31);
-    const int32_t vq31product42 = (int32_t)(uint32_t)((uint64_t)(vproduct42 + vq31rounding) >> 31);
-    const int32_t vq31product43 = (int32_t)(uint32_t)((uint64_t)(vproduct43 + vq31rounding) >> 31);
-    const int32_t vq31product44 = (int32_t)(uint32_t)((uint64_t)(vproduct44 + vq31rounding) >> 31);
-    const int32_t vq31product45 = (int32_t)(uint32_t)((uint64_t)(vproduct45 + vq31rounding) >> 31);
-    const int32_t vq31product46 = (int32_t)(uint32_t)((uint64_t)(vproduct46 + vq31rounding) >> 31);
-    const int32_t vq31product47 = (int32_t)(uint32_t)((uint64_t)(vproduct47 + vq31rounding) >> 31);
-    const int32_t vq31product50 = (int32_t)(uint32_t)((uint64_t)(vproduct50 + vq31rounding) >> 31);
-    const int32_t vq31product51 = (int32_t)(uint32_t)((uint64_t)(vproduct51 + vq31rounding) >> 31);
-    const int32_t vq31product52 = (int32_t)(uint32_t)((uint64_t)(vproduct52 + vq31rounding) >> 31);
-    const int32_t vq31product53 = (int32_t)(uint32_t)((uint64_t)(vproduct53 + vq31rounding) >> 31);
-    const int32_t vq31product54 = (int32_t)(uint32_t)((uint64_t)(vproduct54 + vq31rounding) >> 31);
-    const int32_t vq31product55 = (int32_t)(uint32_t)((uint64_t)(vproduct55 + vq31rounding) >> 31);
-    const int32_t vq31product56 = (int32_t)(uint32_t)((uint64_t)(vproduct56 + vq31rounding) >> 31);
-    const int32_t vq31product57 = (int32_t)(uint32_t)((uint64_t)(vproduct57 + vq31rounding) >> 31);
-    const int32_t vq31product60 = (int32_t)(uint32_t)((uint64_t)(vproduct60 + vq31rounding) >> 31);
-    const int32_t vq31product61 = (int32_t)(uint32_t)((uint64_t)(vproduct61 + vq31rounding) >> 31);
-    const int32_t vq31product62 = (int32_t)(uint32_t)((uint64_t)(vproduct62 + vq31rounding) >> 31);
-    const int32_t vq31product63 = (int32_t)(uint32_t)((uint64_t)(vproduct63 + vq31rounding) >> 31);
-    const int32_t vq31product64 = (int32_t)(uint32_t)((uint64_t)(vproduct64 + vq31rounding) >> 31);
-    const int32_t vq31product65 = (int32_t)(uint32_t)((uint64_t)(vproduct65 + vq31rounding) >> 31);
-    const int32_t vq31product66 = (int32_t)(uint32_t)((uint64_t)(vproduct66 + vq31rounding) >> 31);
-    const int32_t vq31product67 = (int32_t)(uint32_t)((uint64_t)(vproduct67 + vq31rounding) >> 31);
-    const int32_t vq31product70 = (int32_t)(uint32_t)((uint64_t)(vproduct70 + vq31rounding) >> 31);
-    const int32_t vq31product71 = (int32_t)(uint32_t)((uint64_t)(vproduct71 + vq31rounding) >> 31);
-    const int32_t vq31product72 = (int32_t)(uint32_t)((uint64_t)(vproduct72 + vq31rounding) >> 31);
-    const int32_t vq31product73 = (int32_t)(uint32_t)((uint64_t)(vproduct73 + vq31rounding) >> 31);
-    const int32_t vq31product74 = (int32_t)(uint32_t)((uint64_t)(vproduct74 + vq31rounding) >> 31);
-    const int32_t vq31product75 = (int32_t)(uint32_t)((uint64_t)(vproduct75 + vq31rounding) >> 31);
-    const int32_t vq31product76 = (int32_t)(uint32_t)((uint64_t)(vproduct76 + vq31rounding) >> 31);
-    const int32_t vq31product77 = (int32_t)(uint32_t)((uint64_t)(vproduct77 + vq31rounding) >> 31);
-
-    const int32_t vremainder00 = (vq31product00 & vremainder_mask) - (int32_t)(vq31product00 < 0);
-    const int32_t vremainder01 = (vq31product01 & vremainder_mask) - (int32_t)(vq31product01 < 0);
-    const int32_t vremainder02 = (vq31product02 & vremainder_mask) - (int32_t)(vq31product02 < 0);
-    const int32_t vremainder03 = (vq31product03 & vremainder_mask) - (int32_t)(vq31product03 < 0);
-    const int32_t vremainder04 = (vq31product04 & vremainder_mask) - (int32_t)(vq31product04 < 0);
-    const int32_t vremainder05 = (vq31product05 & vremainder_mask) - (int32_t)(vq31product05 < 0);
-    const int32_t vremainder06 = (vq31product06 & vremainder_mask) - (int32_t)(vq31product06 < 0);
-    const int32_t vremainder07 = (vq31product07 & vremainder_mask) - (int32_t)(vq31product07 < 0);
-    const int32_t vremainder10 = (vq31product10 & vremainder_mask) - (int32_t)(vq31product10 < 0);
-    const int32_t vremainder11 = (vq31product11 & vremainder_mask) - (int32_t)(vq31product11 < 0);
-    const int32_t vremainder12 = (vq31product12 & vremainder_mask) - (int32_t)(vq31product12 < 0);
-    const int32_t vremainder13 = (vq31product13 & vremainder_mask) - (int32_t)(vq31product13 < 0);
-    const int32_t vremainder14 = (vq31product14 & vremainder_mask) - (int32_t)(vq31product14 < 0);
-    const int32_t vremainder15 = (vq31product15 & vremainder_mask) - (int32_t)(vq31product15 < 0);
-    const int32_t vremainder16 = (vq31product16 & vremainder_mask) - (int32_t)(vq31product16 < 0);
-    const int32_t vremainder17 = (vq31product17 & vremainder_mask) - (int32_t)(vq31product17 < 0);
-    const int32_t vremainder20 = (vq31product20 & vremainder_mask) - (int32_t)(vq31product20 < 0);
-    const int32_t vremainder21 = (vq31product21 & vremainder_mask) - (int32_t)(vq31product21 < 0);
-    const int32_t vremainder22 = (vq31product22 & vremainder_mask) - (int32_t)(vq31product22 < 0);
-    const int32_t vremainder23 = (vq31product23 & vremainder_mask) - (int32_t)(vq31product23 < 0);
-    const int32_t vremainder24 = (vq31product24 & vremainder_mask) - (int32_t)(vq31product24 < 0);
-    const int32_t vremainder25 = (vq31product25 & vremainder_mask) - (int32_t)(vq31product25 < 0);
-    const int32_t vremainder26 = (vq31product26 & vremainder_mask) - (int32_t)(vq31product26 < 0);
-    const int32_t vremainder27 = (vq31product27 & vremainder_mask) - (int32_t)(vq31product27 < 0);
-    const int32_t vremainder30 = (vq31product30 & vremainder_mask) - (int32_t)(vq31product30 < 0);
-    const int32_t vremainder31 = (vq31product31 & vremainder_mask) - (int32_t)(vq31product31 < 0);
-    const int32_t vremainder32 = (vq31product32 & vremainder_mask) - (int32_t)(vq31product32 < 0);
-    const int32_t vremainder33 = (vq31product33 & vremainder_mask) - (int32_t)(vq31product33 < 0);
-    const int32_t vremainder34 = (vq31product34 & vremainder_mask) - (int32_t)(vq31product34 < 0);
-    const int32_t vremainder35 = (vq31product35 & vremainder_mask) - (int32_t)(vq31product35 < 0);
-    const int32_t vremainder36 = (vq31product36 & vremainder_mask) - (int32_t)(vq31product36 < 0);
-    const int32_t vremainder37 = (vq31product37 & vremainder_mask) - (int32_t)(vq31product37 < 0);
-    const int32_t vremainder40 = (vq31product40 & vremainder_mask) - (int32_t)(vq31product40 < 0);
-    const int32_t vremainder41 = (vq31product41 & vremainder_mask) - (int32_t)(vq31product41 < 0);
-    const int32_t vremainder42 = (vq31product42 & vremainder_mask) - (int32_t)(vq31product42 < 0);
-    const int32_t vremainder43 = (vq31product43 & vremainder_mask) - (int32_t)(vq31product43 < 0);
-    const int32_t vremainder44 = (vq31product44 & vremainder_mask) - (int32_t)(vq31product44 < 0);
-    const int32_t vremainder45 = (vq31product45 & vremainder_mask) - (int32_t)(vq31product45 < 0);
-    const int32_t vremainder46 = (vq31product46 & vremainder_mask) - (int32_t)(vq31product46 < 0);
-    const int32_t vremainder47 = (vq31product47 & vremainder_mask) - (int32_t)(vq31product47 < 0);
-    const int32_t vremainder50 = (vq31product50 & vremainder_mask) - (int32_t)(vq31product50 < 0);
-    const int32_t vremainder51 = (vq31product51 & vremainder_mask) - (int32_t)(vq31product51 < 0);
-    const int32_t vremainder52 = (vq31product52 & vremainder_mask) - (int32_t)(vq31product52 < 0);
-    const int32_t vremainder53 = (vq31product53 & vremainder_mask) - (int32_t)(vq31product53 < 0);
-    const int32_t vremainder54 = (vq31product54 & vremainder_mask) - (int32_t)(vq31product54 < 0);
-    const int32_t vremainder55 = (vq31product55 & vremainder_mask) - (int32_t)(vq31product55 < 0);
-    const int32_t vremainder56 = (vq31product56 & vremainder_mask) - (int32_t)(vq31product56 < 0);
-    const int32_t vremainder57 = (vq31product57 & vremainder_mask) - (int32_t)(vq31product57 < 0);
-    const int32_t vremainder60 = (vq31product60 & vremainder_mask) - (int32_t)(vq31product60 < 0);
-    const int32_t vremainder61 = (vq31product61 & vremainder_mask) - (int32_t)(vq31product61 < 0);
-    const int32_t vremainder62 = (vq31product62 & vremainder_mask) - (int32_t)(vq31product62 < 0);
-    const int32_t vremainder63 = (vq31product63 & vremainder_mask) - (int32_t)(vq31product63 < 0);
-    const int32_t vremainder64 = (vq31product64 & vremainder_mask) - (int32_t)(vq31product64 < 0);
-    const int32_t vremainder65 = (vq31product65 & vremainder_mask) - (int32_t)(vq31product65 < 0);
-    const int32_t vremainder66 = (vq31product66 & vremainder_mask) - (int32_t)(vq31product66 < 0);
-    const int32_t vremainder67 = (vq31product67 & vremainder_mask) - (int32_t)(vq31product67 < 0);
-    const int32_t vremainder70 = (vq31product70 & vremainder_mask) - (int32_t)(vq31product70 < 0);
-    const int32_t vremainder71 = (vq31product71 & vremainder_mask) - (int32_t)(vq31product71 < 0);
-    const int32_t vremainder72 = (vq31product72 & vremainder_mask) - (int32_t)(vq31product72 < 0);
-    const int32_t vremainder73 = (vq31product73 & vremainder_mask) - (int32_t)(vq31product73 < 0);
-    const int32_t vremainder74 = (vq31product74 & vremainder_mask) - (int32_t)(vq31product74 < 0);
-    const int32_t vremainder75 = (vq31product75 & vremainder_mask) - (int32_t)(vq31product75 < 0);
-    const int32_t vremainder76 = (vq31product76 & vremainder_mask) - (int32_t)(vq31product76 < 0);
-    const int32_t vremainder77 = (vq31product77 & vremainder_mask) - (int32_t)(vq31product77 < 0);
-
-    int32_t vout00 = asr_s32(vq31product00, vshift) + (int32_t)(vremainder00 > vremainder_threshold);
-    int32_t vout01 = asr_s32(vq31product01, vshift) + (int32_t)(vremainder01 > vremainder_threshold);
-    int32_t vout02 = asr_s32(vq31product02, vshift) + (int32_t)(vremainder02 > vremainder_threshold);
-    int32_t vout03 = asr_s32(vq31product03, vshift) + (int32_t)(vremainder03 > vremainder_threshold);
-    int32_t vout04 = asr_s32(vq31product04, vshift) + (int32_t)(vremainder04 > vremainder_threshold);
-    int32_t vout05 = asr_s32(vq31product05, vshift) + (int32_t)(vremainder05 > vremainder_threshold);
-    int32_t vout06 = asr_s32(vq31product06, vshift) + (int32_t)(vremainder06 > vremainder_threshold);
-    int32_t vout07 = asr_s32(vq31product07, vshift) + (int32_t)(vremainder07 > vremainder_threshold);
-    int32_t vout10 = asr_s32(vq31product10, vshift) + (int32_t)(vremainder10 > vremainder_threshold);
-    int32_t vout11 = asr_s32(vq31product11, vshift) + (int32_t)(vremainder11 > vremainder_threshold);
-    int32_t vout12 = asr_s32(vq31product12, vshift) + (int32_t)(vremainder12 > vremainder_threshold);
-    int32_t vout13 = asr_s32(vq31product13, vshift) + (int32_t)(vremainder13 > vremainder_threshold);
-    int32_t vout14 = asr_s32(vq31product14, vshift) + (int32_t)(vremainder14 > vremainder_threshold);
-    int32_t vout15 = asr_s32(vq31product15, vshift) + (int32_t)(vremainder15 > vremainder_threshold);
-    int32_t vout16 = asr_s32(vq31product16, vshift) + (int32_t)(vremainder16 > vremainder_threshold);
-    int32_t vout17 = asr_s32(vq31product17, vshift) + (int32_t)(vremainder17 > vremainder_threshold);
-    int32_t vout20 = asr_s32(vq31product20, vshift) + (int32_t)(vremainder20 > vremainder_threshold);
-    int32_t vout21 = asr_s32(vq31product21, vshift) + (int32_t)(vremainder21 > vremainder_threshold);
-    int32_t vout22 = asr_s32(vq31product22, vshift) + (int32_t)(vremainder22 > vremainder_threshold);
-    int32_t vout23 = asr_s32(vq31product23, vshift) + (int32_t)(vremainder23 > vremainder_threshold);
-    int32_t vout24 = asr_s32(vq31product24, vshift) + (int32_t)(vremainder24 > vremainder_threshold);
-    int32_t vout25 = asr_s32(vq31product25, vshift) + (int32_t)(vremainder25 > vremainder_threshold);
-    int32_t vout26 = asr_s32(vq31product26, vshift) + (int32_t)(vremainder26 > vremainder_threshold);
-    int32_t vout27 = asr_s32(vq31product27, vshift) + (int32_t)(vremainder27 > vremainder_threshold);
-    int32_t vout30 = asr_s32(vq31product30, vshift) + (int32_t)(vremainder30 > vremainder_threshold);
-    int32_t vout31 = asr_s32(vq31product31, vshift) + (int32_t)(vremainder31 > vremainder_threshold);
-    int32_t vout32 = asr_s32(vq31product32, vshift) + (int32_t)(vremainder32 > vremainder_threshold);
-    int32_t vout33 = asr_s32(vq31product33, vshift) + (int32_t)(vremainder33 > vremainder_threshold);
-    int32_t vout34 = asr_s32(vq31product34, vshift) + (int32_t)(vremainder34 > vremainder_threshold);
-    int32_t vout35 = asr_s32(vq31product35, vshift) + (int32_t)(vremainder35 > vremainder_threshold);
-    int32_t vout36 = asr_s32(vq31product36, vshift) + (int32_t)(vremainder36 > vremainder_threshold);
-    int32_t vout37 = asr_s32(vq31product37, vshift) + (int32_t)(vremainder37 > vremainder_threshold);
-    int32_t vout40 = asr_s32(vq31product40, vshift) + (int32_t)(vremainder40 > vremainder_threshold);
-    int32_t vout41 = asr_s32(vq31product41, vshift) + (int32_t)(vremainder41 > vremainder_threshold);
-    int32_t vout42 = asr_s32(vq31product42, vshift) + (int32_t)(vremainder42 > vremainder_threshold);
-    int32_t vout43 = asr_s32(vq31product43, vshift) + (int32_t)(vremainder43 > vremainder_threshold);
-    int32_t vout44 = asr_s32(vq31product44, vshift) + (int32_t)(vremainder44 > vremainder_threshold);
-    int32_t vout45 = asr_s32(vq31product45, vshift) + (int32_t)(vremainder45 > vremainder_threshold);
-    int32_t vout46 = asr_s32(vq31product46, vshift) + (int32_t)(vremainder46 > vremainder_threshold);
-    int32_t vout47 = asr_s32(vq31product47, vshift) + (int32_t)(vremainder47 > vremainder_threshold);
-    int32_t vout50 = asr_s32(vq31product50, vshift) + (int32_t)(vremainder50 > vremainder_threshold);
-    int32_t vout51 = asr_s32(vq31product51, vshift) + (int32_t)(vremainder51 > vremainder_threshold);
-    int32_t vout52 = asr_s32(vq31product52, vshift) + (int32_t)(vremainder52 > vremainder_threshold);
-    int32_t vout53 = asr_s32(vq31product53, vshift) + (int32_t)(vremainder53 > vremainder_threshold);
-    int32_t vout54 = asr_s32(vq31product54, vshift) + (int32_t)(vremainder54 > vremainder_threshold);
-    int32_t vout55 = asr_s32(vq31product55, vshift) + (int32_t)(vremainder55 > vremainder_threshold);
-    int32_t vout56 = asr_s32(vq31product56, vshift) + (int32_t)(vremainder56 > vremainder_threshold);
-    int32_t vout57 = asr_s32(vq31product57, vshift) + (int32_t)(vremainder57 > vremainder_threshold);
-    int32_t vout60 = asr_s32(vq31product60, vshift) + (int32_t)(vremainder60 > vremainder_threshold);
-    int32_t vout61 = asr_s32(vq31product61, vshift) + (int32_t)(vremainder61 > vremainder_threshold);
-    int32_t vout62 = asr_s32(vq31product62, vshift) + (int32_t)(vremainder62 > vremainder_threshold);
-    int32_t vout63 = asr_s32(vq31product63, vshift) + (int32_t)(vremainder63 > vremainder_threshold);
-    int32_t vout64 = asr_s32(vq31product64, vshift) + (int32_t)(vremainder64 > vremainder_threshold);
-    int32_t vout65 = asr_s32(vq31product65, vshift) + (int32_t)(vremainder65 > vremainder_threshold);
-    int32_t vout66 = asr_s32(vq31product66, vshift) + (int32_t)(vremainder66 > vremainder_threshold);
-    int32_t vout67 = asr_s32(vq31product67, vshift) + (int32_t)(vremainder67 > vremainder_threshold);
-    int32_t vout70 = asr_s32(vq31product70, vshift) + (int32_t)(vremainder70 > vremainder_threshold);
-    int32_t vout71 = asr_s32(vq31product71, vshift) + (int32_t)(vremainder71 > vremainder_threshold);
-    int32_t vout72 = asr_s32(vq31product72, vshift) + (int32_t)(vremainder72 > vremainder_threshold);
-    int32_t vout73 = asr_s32(vq31product73, vshift) + (int32_t)(vremainder73 > vremainder_threshold);
-    int32_t vout74 = asr_s32(vq31product74, vshift) + (int32_t)(vremainder74 > vremainder_threshold);
-    int32_t vout75 = asr_s32(vq31product75, vshift) + (int32_t)(vremainder75 > vremainder_threshold);
-    int32_t vout76 = asr_s32(vq31product76, vshift) + (int32_t)(vremainder76 > vremainder_threshold);
-    int32_t vout77 = asr_s32(vq31product77, vshift) + (int32_t)(vremainder77 > vremainder_threshold);
-
-    vout00 = vout00 < voutput_min ? voutput_min : vout00;
-    vout01 = vout01 < voutput_min ? voutput_min : vout01;
-    vout02 = vout02 < voutput_min ? voutput_min : vout02;
-    vout03 = vout03 < voutput_min ? voutput_min : vout03;
-    vout04 = vout04 < voutput_min ? voutput_min : vout04;
-    vout05 = vout05 < voutput_min ? voutput_min : vout05;
-    vout06 = vout06 < voutput_min ? voutput_min : vout06;
-    vout07 = vout07 < voutput_min ? voutput_min : vout07;
-    vout10 = vout10 < voutput_min ? voutput_min : vout10;
-    vout11 = vout11 < voutput_min ? voutput_min : vout11;
-    vout12 = vout12 < voutput_min ? voutput_min : vout12;
-    vout13 = vout13 < voutput_min ? voutput_min : vout13;
-    vout14 = vout14 < voutput_min ? voutput_min : vout14;
-    vout15 = vout15 < voutput_min ? voutput_min : vout15;
-    vout16 = vout16 < voutput_min ? voutput_min : vout16;
-    vout17 = vout17 < voutput_min ? voutput_min : vout17;
-    vout20 = vout20 < voutput_min ? voutput_min : vout20;
-    vout21 = vout21 < voutput_min ? voutput_min : vout21;
-    vout22 = vout22 < voutput_min ? voutput_min : vout22;
-    vout23 = vout23 < voutput_min ? voutput_min : vout23;
-    vout24 = vout24 < voutput_min ? voutput_min : vout24;
-    vout25 = vout25 < voutput_min ? voutput_min : vout25;
-    vout26 = vout26 < voutput_min ? voutput_min : vout26;
-    vout27 = vout27 < voutput_min ? voutput_min : vout27;
-    vout30 = vout30 < voutput_min ? voutput_min : vout30;
-    vout31 = vout31 < voutput_min ? voutput_min : vout31;
-    vout32 = vout32 < voutput_min ? voutput_min : vout32;
-    vout33 = vout33 < voutput_min ? voutput_min : vout33;
-    vout34 = vout34 < voutput_min ? voutput_min : vout34;
-    vout35 = vout35 < voutput_min ? voutput_min : vout35;
-    vout36 = vout36 < voutput_min ? voutput_min : vout36;
-    vout37 = vout37 < voutput_min ? voutput_min : vout37;
-    vout40 = vout40 < voutput_min ? voutput_min : vout40;
-    vout41 = vout41 < voutput_min ? voutput_min : vout41;
-    vout42 = vout42 < voutput_min ? voutput_min : vout42;
-    vout43 = vout43 < voutput_min ? voutput_min : vout43;
-    vout44 = vout44 < voutput_min ? voutput_min : vout44;
-    vout45 = vout45 < voutput_min ? voutput_min : vout45;
-    vout46 = vout46 < voutput_min ? voutput_min : vout46;
-    vout47 = vout47 < voutput_min ? voutput_min : vout47;
-    vout50 = vout50 < voutput_min ? voutput_min : vout50;
-    vout51 = vout51 < voutput_min ? voutput_min : vout51;
-    vout52 = vout52 < voutput_min ? voutput_min : vout52;
-    vout53 = vout53 < voutput_min ? voutput_min : vout53;
-    vout54 = vout54 < voutput_min ? voutput_min : vout54;
-    vout55 = vout55 < voutput_min ? voutput_min : vout55;
-    vout56 = vout56 < voutput_min ? voutput_min : vout56;
-    vout57 = vout57 < voutput_min ? voutput_min : vout57;
-    vout60 = vout60 < voutput_min ? voutput_min : vout60;
-    vout61 = vout61 < voutput_min ? voutput_min : vout61;
-    vout62 = vout62 < voutput_min ? voutput_min : vout62;
-    vout63 = vout63 < voutput_min ? voutput_min : vout63;
-    vout64 = vout64 < voutput_min ? voutput_min : vout64;
-    vout65 = vout65 < voutput_min ? voutput_min : vout65;
-    vout66 = vout66 < voutput_min ? voutput_min : vout66;
-    vout67 = vout67 < voutput_min ? voutput_min : vout67;
-    vout70 = vout70 < voutput_min ? voutput_min : vout70;
-    vout71 = vout71 < voutput_min ? voutput_min : vout71;
-    vout72 = vout72 < voutput_min ? voutput_min : vout72;
-    vout73 = vout73 < voutput_min ? voutput_min : vout73;
-    vout74 = vout74 < voutput_min ? voutput_min : vout74;
-    vout75 = vout75 < voutput_min ? voutput_min : vout75;
-    vout76 = vout76 < voutput_min ? voutput_min : vout76;
-    vout77 = vout77 < voutput_min ? voutput_min : vout77;
-
-    vout00 = vout00 > voutput_max ? voutput_max : vout00;
-    vout01 = vout01 > voutput_max ? voutput_max : vout01;
-    vout02 = vout02 > voutput_max ? voutput_max : vout02;
-    vout03 = vout03 > voutput_max ? voutput_max : vout03;
-    vout04 = vout04 > voutput_max ? voutput_max : vout04;
-    vout05 = vout05 > voutput_max ? voutput_max : vout05;
-    vout06 = vout06 > voutput_max ? voutput_max : vout06;
-    vout07 = vout07 > voutput_max ? voutput_max : vout07;
-    vout10 = vout10 > voutput_max ? voutput_max : vout10;
-    vout11 = vout11 > voutput_max ? voutput_max : vout11;
-    vout12 = vout12 > voutput_max ? voutput_max : vout12;
-    vout13 = vout13 > voutput_max ? voutput_max : vout13;
-    vout14 = vout14 > voutput_max ? voutput_max : vout14;
-    vout15 = vout15 > voutput_max ? voutput_max : vout15;
-    vout16 = vout16 > voutput_max ? voutput_max : vout16;
-    vout17 = vout17 > voutput_max ? voutput_max : vout17;
-    vout20 = vout20 > voutput_max ? voutput_max : vout20;
-    vout21 = vout21 > voutput_max ? voutput_max : vout21;
-    vout22 = vout22 > voutput_max ? voutput_max : vout22;
-    vout23 = vout23 > voutput_max ? voutput_max : vout23;
-    vout24 = vout24 > voutput_max ? voutput_max : vout24;
-    vout25 = vout25 > voutput_max ? voutput_max : vout25;
-    vout26 = vout26 > voutput_max ? voutput_max : vout26;
-    vout27 = vout27 > voutput_max ? voutput_max : vout27;
-    vout30 = vout30 > voutput_max ? voutput_max : vout30;
-    vout31 = vout31 > voutput_max ? voutput_max : vout31;
-    vout32 = vout32 > voutput_max ? voutput_max : vout32;
-    vout33 = vout33 > voutput_max ? voutput_max : vout33;
-    vout34 = vout34 > voutput_max ? voutput_max : vout34;
-    vout35 = vout35 > voutput_max ? voutput_max : vout35;
-    vout36 = vout36 > voutput_max ? voutput_max : vout36;
-    vout37 = vout37 > voutput_max ? voutput_max : vout37;
-    vout40 = vout40 > voutput_max ? voutput_max : vout40;
-    vout41 = vout41 > voutput_max ? voutput_max : vout41;
-    vout42 = vout42 > voutput_max ? voutput_max : vout42;
-    vout43 = vout43 > voutput_max ? voutput_max : vout43;
-    vout44 = vout44 > voutput_max ? voutput_max : vout44;
-    vout45 = vout45 > voutput_max ? voutput_max : vout45;
-    vout46 = vout46 > voutput_max ? voutput_max : vout46;
-    vout47 = vout47 > voutput_max ? voutput_max : vout47;
-    vout50 = vout50 > voutput_max ? voutput_max : vout50;
-    vout51 = vout51 > voutput_max ? voutput_max : vout51;
-    vout52 = vout52 > voutput_max ? voutput_max : vout52;
-    vout53 = vout53 > voutput_max ? voutput_max : vout53;
-    vout54 = vout54 > voutput_max ? voutput_max : vout54;
-    vout55 = vout55 > voutput_max ? voutput_max : vout55;
-    vout56 = vout56 > voutput_max ? voutput_max : vout56;
-    vout57 = vout57 > voutput_max ? voutput_max : vout57;
-    vout60 = vout60 > voutput_max ? voutput_max : vout60;
-    vout61 = vout61 > voutput_max ? voutput_max : vout61;
-    vout62 = vout62 > voutput_max ? voutput_max : vout62;
-    vout63 = vout63 > voutput_max ? voutput_max : vout63;
-    vout64 = vout64 > voutput_max ? voutput_max : vout64;
-    vout65 = vout65 > voutput_max ? voutput_max : vout65;
-    vout66 = vout66 > voutput_max ? voutput_max : vout66;
-    vout67 = vout67 > voutput_max ? voutput_max : vout67;
-    vout70 = vout70 > voutput_max ? voutput_max : vout70;
-    vout71 = vout71 > voutput_max ? voutput_max : vout71;
-    vout72 = vout72 > voutput_max ? voutput_max : vout72;
-    vout73 = vout73 > voutput_max ? voutput_max : vout73;
-    vout74 = vout74 > voutput_max ? voutput_max : vout74;
-    vout75 = vout75 > voutput_max ? voutput_max : vout75;
-    vout76 = vout76 > voutput_max ? voutput_max : vout76;
-    vout77 = vout77 > voutput_max ? voutput_max : vout77;
-
-    vout00 += voutput_zero_point;
-    vout01 += voutput_zero_point;
-    vout02 += voutput_zero_point;
-    vout03 += voutput_zero_point;
-    vout04 += voutput_zero_point;
-    vout05 += voutput_zero_point;
-    vout06 += voutput_zero_point;
-    vout07 += voutput_zero_point;
-    vout10 += voutput_zero_point;
-    vout11 += voutput_zero_point;
-    vout12 += voutput_zero_point;
-    vout13 += voutput_zero_point;
-    vout14 += voutput_zero_point;
-    vout15 += voutput_zero_point;
-    vout16 += voutput_zero_point;
-    vout17 += voutput_zero_point;
-    vout20 += voutput_zero_point;
-    vout21 += voutput_zero_point;
-    vout22 += voutput_zero_point;
-    vout23 += voutput_zero_point;
-    vout24 += voutput_zero_point;
-    vout25 += voutput_zero_point;
-    vout26 += voutput_zero_point;
-    vout27 += voutput_zero_point;
-    vout30 += voutput_zero_point;
-    vout31 += voutput_zero_point;
-    vout32 += voutput_zero_point;
-    vout33 += voutput_zero_point;
-    vout34 += voutput_zero_point;
-    vout35 += voutput_zero_point;
-    vout36 += voutput_zero_point;
-    vout37 += voutput_zero_point;
-    vout40 += voutput_zero_point;
-    vout41 += voutput_zero_point;
-    vout42 += voutput_zero_point;
-    vout43 += voutput_zero_point;
-    vout44 += voutput_zero_point;
-    vout45 += voutput_zero_point;
-    vout46 += voutput_zero_point;
-    vout47 += voutput_zero_point;
-    vout50 += voutput_zero_point;
-    vout51 += voutput_zero_point;
-    vout52 += voutput_zero_point;
-    vout53 += voutput_zero_point;
-    vout54 += voutput_zero_point;
-    vout55 += voutput_zero_point;
-    vout56 += voutput_zero_point;
-    vout57 += voutput_zero_point;
-    vout60 += voutput_zero_point;
-    vout61 += voutput_zero_point;
-    vout62 += voutput_zero_point;
-    vout63 += voutput_zero_point;
-    vout64 += voutput_zero_point;
-    vout65 += voutput_zero_point;
-    vout66 += voutput_zero_point;
-    vout67 += voutput_zero_point;
-    vout70 += voutput_zero_point;
-    vout71 += voutput_zero_point;
-    vout72 += voutput_zero_point;
-    vout73 += voutput_zero_point;
-    vout74 += voutput_zero_point;
-    vout75 += voutput_zero_point;
-    vout76 += voutput_zero_point;
-    vout77 += voutput_zero_point;
-
-    if XNN_LIKELY (nc >= 8) {
-      // Main case where all 8 columns fit in the destination.
-      c0[0] = (int8_t) vout00;
-      c0[1] = (int8_t) vout01;
-      c0[2] = (int8_t) vout02;
-      c0[3] = (int8_t) vout03;
-      c0[4] = (int8_t) vout04;
-      c0[5] = (int8_t) vout05;
-      c0[6] = (int8_t) vout06;
-      c0[7] = (int8_t) vout07;
-      c1[0] = (int8_t) vout10;
-      c1[1] = (int8_t) vout11;
-      c1[2] = (int8_t) vout12;
-      c1[3] = (int8_t) vout13;
-      c1[4] = (int8_t) vout14;
-      c1[5] = (int8_t) vout15;
-      c1[6] = (int8_t) vout16;
-      c1[7] = (int8_t) vout17;
-      c2[0] = (int8_t) vout20;
-      c2[1] = (int8_t) vout21;
-      c2[2] = (int8_t) vout22;
-      c2[3] = (int8_t) vout23;
-      c2[4] = (int8_t) vout24;
-      c2[5] = (int8_t) vout25;
-      c2[6] = (int8_t) vout26;
-      c2[7] = (int8_t) vout27;
-      c3[0] = (int8_t) vout30;
-      c3[1] = (int8_t) vout31;
-      c3[2] = (int8_t) vout32;
-      c3[3] = (int8_t) vout33;
-      c3[4] = (int8_t) vout34;
-      c3[5] = (int8_t) vout35;
-      c3[6] = (int8_t) vout36;
-      c3[7] = (int8_t) vout37;
-      c4[0] = (int8_t) vout40;
-      c4[1] = (int8_t) vout41;
-      c4[2] = (int8_t) vout42;
-      c4[3] = (int8_t) vout43;
-      c4[4] = (int8_t) vout44;
-      c4[5] = (int8_t) vout45;
-      c4[6] = (int8_t) vout46;
-      c4[7] = (int8_t) vout47;
-      c5[0] = (int8_t) vout50;
-      c5[1] = (int8_t) vout51;
-      c5[2] = (int8_t) vout52;
-      c5[3] = (int8_t) vout53;
-      c5[4] = (int8_t) vout54;
-      c5[5] = (int8_t) vout55;
-      c5[6] = (int8_t) vout56;
-      c5[7] = (int8_t) vout57;
-      c6[0] = (int8_t) vout60;
-      c6[1] = (int8_t) vout61;
-      c6[2] = (int8_t) vout62;
-      c6[3] = (int8_t) vout63;
-      c6[4] = (int8_t) vout64;
-      c6[5] = (int8_t) vout65;
-      c6[6] = (int8_t) vout66;
-      c6[7] = (int8_t) vout67;
-      c7[0] = (int8_t) vout70;
-      c7[1] = (int8_t) vout71;
-      c7[2] = (int8_t) vout72;
-      c7[3] = (int8_t) vout73;
-      c7[4] = (int8_t) vout74;
-      c7[5] = (int8_t) vout75;
-      c7[6] = (int8_t) vout76;
-      c7[7] = (int8_t) vout77;
-
-      // Advance to the next 8 columns.
-      c0 = (int8_t*)((uintptr_t)c0 + cn_stride);
-      c1 = (int8_t*)((uintptr_t)c1 + cn_stride);
-      c2 = (int8_t*)((uintptr_t)c2 + cn_stride);
-      c3 = (int8_t*)((uintptr_t)c3 + cn_stride);
-      c4 = (int8_t*)((uintptr_t)c4 + cn_stride);
-      c5 = (int8_t*)((uintptr_t)c5 + cn_stride);
-      c6 = (int8_t*)((uintptr_t)c6 + cn_stride);
-      c7 = (int8_t*)((uintptr_t)c7 + cn_stride);
-
-      nc -= 8;
-    } else {
-      // Final case where not all of the 8 columns fit in the destination.
-      if (nc > 0) {
-        c0[0] = vout00;
-        c1[0] = vout10;
-        c2[0] = vout20;
-        c3[0] = vout30;
-        c4[0] = vout40;
-        c5[0] = vout50;
-        c6[0] = vout60;
-        c7[0] = vout70;
-      }
-      if (nc > 1) {
-        c0[1] = vout01;
-        c1[1] = vout11;
-        c2[1] = vout21;
-        c3[1] = vout31;
-        c4[1] = vout41;
-        c5[1] = vout51;
-        c6[1] = vout61;
-        c7[1] = vout71;
-      }
-      if (nc > 2) {
-        c0[2] = vout02;
-        c1[2] = vout12;
-        c2[2] = vout22;
-        c3[2] = vout32;
-        c4[2] = vout42;
-        c5[2] = vout52;
-        c6[2] = vout62;
-        c7[2] = vout72;
-      }
-      if (nc > 3) {
-        c0[3] = vout03;
-        c1[3] = vout13;
-        c2[3] = vout23;
-        c3[3] = vout33;
-        c4[3] = vout43;
-        c5[3] = vout53;
-        c6[3] = vout63;
-        c7[3] = vout73;
-      }
-      if (nc > 4) {
-        c0[4] = vout04;
-        c1[4] = vout14;
-        c2[4] = vout24;
-        c3[4] = vout34;
-        c4[4] = vout44;
-        c5[4] = vout54;
-        c6[4] = vout64;
-        c7[4] = vout74;
-      }
-      if (nc > 5) {
-        c0[5] = vout05;
-        c1[5] = vout15;
-        c2[5] = vout25;
-        c3[5] = vout35;
-        c4[5] = vout45;
-        c5[5] = vout55;
-        c6[5] = vout65;
-        c7[5] = vout75;
-      }
-      if (nc > 6) {
-        c0[6] = vout06;
-        c1[6] = vout16;
-        c2[6] = vout26;
-        c3[6] = vout36;
-        c4[6] = vout46;
-        c5[6] = vout56;
-        c6[6] = vout66;
-        c7[6] = vout76;
-      }
-      if (nc > 7) {
-        c0[7] = vout07;
-        c1[7] = vout17;
-        c2[7] = vout27;
-        c3[7] = vout37;
-        c4[7] = vout47;
-        c5[7] = vout57;
-        c6[7] = vout67;
-        c7[7] = vout77;
-      }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
diff --git a/src/qs8-gemm/neon-mull-addw-dup.c.in b/src/qs8-gemm/neon-mull-addw-dup.c.in
new file mode 100644
index 0000000..3d38ef2
--- /dev/null
+++ b/src/qs8-gemm/neon-mull-addw-dup.c.in
@@ -0,0 +1,311 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+$assert NR % 8 == 0
+$assert 8 <= NR <= 16
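+// MR (rows of A/C) and NR (columns of C, a multiple of 8) are filled in by the kernel generator.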
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/gemm.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_${MR}x${NR}__neon_mull_addw_dup(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= ${MR});
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
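+  // Rows beyond mr alias the previous row's A and C pointers below, so the extra rows are
+  // computed as harmless duplicates instead of reading or writing out of bounds.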
+  $for M in range(1, MR):
+    const int8_t* a${M} = (const int8_t*) ((uintptr_t) a${M-1} + a_stride);
+    int8_t* c${M} = (int8_t*) ((uintptr_t) c${M-1} + cm_stride);
+    $if M % 2 == 0:
+      if XNN_UNPREDICTABLE(mr <= ${M}) {
+        a${M} = a${M-1};
+        c${M} = c${M-1};
+      }
+    $elif M + 1 == MR:
+      if XNN_UNPREDICTABLE(mr != ${M+1}) {
+        a${M} = a${M-1};
+        c${M} = c${M-1};
+      }
+    $else:
+      if XNN_UNPREDICTABLE(mr < ${M+1}) {
+        a${M} = a${M-1};
+        c${M} = c${M-1};
+      }
+
+  do {
+    $for N in range(0, NR, 4):
+      int32x4_t vacc0x${ABC[N:N+4]} = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    $for M in range(1, MR):
+      $for N in range(0, NR, 4):
+        int32x4_t vacc${M}x${ABC[N:N+4]} = vacc0x${ABC[N:N+4]};
+
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
+      $for M in range(MR):
+        const int8x8_t va${M} = vld1_s8(a${M}); a${M} += 8;
+
+      $for K in range(8):
+        $for N in range(0, NR, 8):
+          const int8x8_t vb${ABC[N:N+8]}c${K} = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          $for M in range(MR):
+            const int16x8_t vprod${M}x${ABC[N:N+8]}c${K} = vmull_s8(vb${ABC[N:N+8]}c${K}, vdup_lane_s8(va${M}, ${K}));
+            vacc${M}x${ABC[N:N+4]} = vaddw_s16(vacc${M}x${ABC[N:N+4]}, vget_low_s16(vprod${M}x${ABC[N:N+8]}c${K}));
+            vacc${M}x${ABC[N+4:N+8]} = vaddw_s16(vacc${M}x${ABC[N+4:N+8]}, vget_high_s16(vprod${M}x${ABC[N:N+8]}c${K}));
+
+      k -= 8 * sizeof(int8_t);
+    }
+    if XNN_UNLIKELY(k != 0) {
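+      // Remainder of 1-7 bytes of K, handled one byte at a time by the nested ifs below.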
+      $for M in range(MR):
+        const int8x8_t va${M} = vld1_s8(a${M}); a${M} = (const int8_t*) ((uintptr_t) a${M} + k);
+
+      $for N in range(0, NR, 8):
+        const int8x8_t vb${ABC[N:N+8]}c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      $for M in range(MR):
+        $for N in range(0, NR, 8):
+          const int16x8_t vprod${M}x${ABC[N:N+8]}c0 = vmull_s8(vb${ABC[N:N+8]}c0, vdup_lane_s8(va${M}, 0));
+          vacc${M}x${ABC[N:N+4]} = vaddw_s16(vacc${M}x${ABC[N:N+4]}, vget_low_s16(vprod${M}x${ABC[N:N+8]}c0));
+          vacc${M}x${ABC[N+4:N+8]} = vaddw_s16(vacc${M}x${ABC[N+4:N+8]}, vget_high_s16(vprod${M}x${ABC[N:N+8]}c0));
+
+      if (k >= 2 * sizeof(int8_t)) {
+        $for N in range(0, NR, 8):
+          const int8x8_t vb${ABC[N:N+8]}c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        $for M in range(MR):
+          $for N in range(0, NR, 8):
+            const int16x8_t vprod${M}x${ABC[N:N+8]}c1 = vmull_s8(vb${ABC[N:N+8]}c1, vdup_lane_s8(va${M}, 1));
+            vacc${M}x${ABC[N:N+4]} = vaddw_s16(vacc${M}x${ABC[N:N+4]}, vget_low_s16(vprod${M}x${ABC[N:N+8]}c1));
+            vacc${M}x${ABC[N+4:N+8]} = vaddw_s16(vacc${M}x${ABC[N+4:N+8]}, vget_high_s16(vprod${M}x${ABC[N:N+8]}c1));
+
+        if (k > 2 * sizeof(int8_t)) {
+          $for N in range(0, NR, 8):
+            const int8x8_t vb${ABC[N:N+8]}c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          $for M in range(MR):
+            $for N in range(0, NR, 8):
+              const int16x8_t vprod${M}x${ABC[N:N+8]}c2 = vmull_s8(vb${ABC[N:N+8]}c2, vdup_lane_s8(va${M}, 2));
+              vacc${M}x${ABC[N:N+4]} = vaddw_s16(vacc${M}x${ABC[N:N+4]}, vget_low_s16(vprod${M}x${ABC[N:N+8]}c2));
+              vacc${M}x${ABC[N+4:N+8]} = vaddw_s16(vacc${M}x${ABC[N+4:N+8]}, vget_high_s16(vprod${M}x${ABC[N:N+8]}c2));
+
+          if (k >= 4 * sizeof(int8_t)) {
+            $for N in range(0, NR, 8):
+              const int8x8_t vb${ABC[N:N+8]}c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+            $for M in range(MR):
+              $for N in range(0, NR, 8):
+                const int16x8_t vprod${M}x${ABC[N:N+8]}c3 = vmull_s8(vb${ABC[N:N+8]}c3, vdup_lane_s8(va${M}, 3));
+                vacc${M}x${ABC[N:N+4]} = vaddw_s16(vacc${M}x${ABC[N:N+4]}, vget_low_s16(vprod${M}x${ABC[N:N+8]}c3));
+                vacc${M}x${ABC[N+4:N+8]} = vaddw_s16(vacc${M}x${ABC[N+4:N+8]}, vget_high_s16(vprod${M}x${ABC[N:N+8]}c3));
+
+            if (k > 4 * sizeof(int8_t)) {
+              $for N in range(0, NR, 8):
+                const int8x8_t vb${ABC[N:N+8]}c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+              $for M in range(MR):
+                $for N in range(0, NR, 8):
+                  const int16x8_t vprod${M}x${ABC[N:N+8]}c4 = vmull_s8(vb${ABC[N:N+8]}c4, vdup_lane_s8(va${M}, 4));
+                  vacc${M}x${ABC[N:N+4]} = vaddw_s16(vacc${M}x${ABC[N:N+4]}, vget_low_s16(vprod${M}x${ABC[N:N+8]}c4));
+                  vacc${M}x${ABC[N+4:N+8]} = vaddw_s16(vacc${M}x${ABC[N+4:N+8]}, vget_high_s16(vprod${M}x${ABC[N:N+8]}c4));
+
+              if (k >= 6 * sizeof(int8_t)) {
+                $for N in range(0, NR, 8):
+                  const int8x8_t vb${ABC[N:N+8]}c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+                $for M in range(MR):
+                  $for N in range(0, NR, 8):
+                    const int16x8_t vprod${M}x${ABC[N:N+8]}c5 = vmull_s8(vb${ABC[N:N+8]}c5, vdup_lane_s8(va${M}, 5));
+                    vacc${M}x${ABC[N:N+4]} = vaddw_s16(vacc${M}x${ABC[N:N+4]}, vget_low_s16(vprod${M}x${ABC[N:N+8]}c5));
+                    vacc${M}x${ABC[N+4:N+8]} = vaddw_s16(vacc${M}x${ABC[N+4:N+8]}, vget_high_s16(vprod${M}x${ABC[N:N+8]}c5));
+
+                if (k > 6 * sizeof(int8_t)) {
+                  $for N in range(0, NR, 8):
+                    const int8x8_t vb${ABC[N:N+8]}c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+                  $for M in range(MR):
+                    $for N in range(0, NR, 8):
+                      const int16x8_t vprod${M}x${ABC[N:N+8]}c6 = vmull_s8(vb${ABC[N:N+8]}c6, vdup_lane_s8(va${M}, 6));
+                      vacc${M}x${ABC[N:N+4]} = vaddw_s16(vacc${M}x${ABC[N:N+4]}, vget_low_s16(vprod${M}x${ABC[N:N+8]}c6));
+                      vacc${M}x${ABC[N+4:N+8]} = vaddw_s16(vacc${M}x${ABC[N+4:N+8]}, vget_high_s16(vprod${M}x${ABC[N:N+8]}c6));
+                }
+              }
+            }
+          }
+        }
+      }
+    }
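+    // The requantization below, as a rough scalar sketch (helper names here are descriptive
+    // only, not XNNPACK identifiers):
+    //   acc = q31_rounding_mul(acc, multiplier);                 // vqrdmulhq_s32
+    //   acc = rounding_shift_right(acc, right_shift);            // vbicq/vsraq fix-up + vrshlq_s32
+    //   out = clamp(sat_s8(sat_s16(acc) + output_zero_point), output_min, output_max);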
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    $for M in range(MR):
+      $for N in range(0, NR, 4):
+        vacc${M}x${ABC[N:N+4]} = vqrdmulhq_s32(vacc${M}x${ABC[N:N+4]}, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    $for M in range(MR):
+      $for N in range(0, NR, 4):
+        vacc${M}x${ABC[N:N+4]} = vsraq_n_s32(vacc${M}x${ABC[N:N+4]}, vbicq_s32(vacc${M}x${ABC[N:N+4]}, vzero_shift_mask), 31);
+
+    $for M in range(MR):
+      $for N in range(0, NR, 4):
+        vacc${M}x${ABC[N:N+4]} = vrshlq_s32(vacc${M}x${ABC[N:N+4]}, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    $for M in range(MR):
+      $for N in range(0, NR, 8):
+        const int16x8_t vacc${M}x${ABC[N:N+8]} = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc${M}x${ABC[N:N+4]}), vacc${M}x${ABC[N+4:N+8]}), voutput_zero_point);
+
+    $for M in range(MR):
+      $for N in range(0, NR, 16):
+        $if N + 8 < NR:
+          int8x16_t vout${M}x${ABC[N:N+16]} = vqmovn_high_s16(vqmovn_s16(vacc${M}x${ABC[N:N+8]}), vacc${M}x${ABC[N+8:N+16]});
+        $elif M % 2 == 1:
+          int8x16_t vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vqmovn_high_s16(vqmovn_s16(vacc${M-1}x${ABC[N:N+8]}), vacc${M}x${ABC[N:N+8]});
+        $elif M + 1 == MR:
+          int8x8_t vout${M}x${ABC[N:N+8]} = vqmovn_s16(vacc${M}x${ABC[N:N+8]});
+#else
+    $for M in range(MR):
+      $for N in range(0, NR, 8):
+        const int16x8_t vacc${M}x${ABC[N:N+8]} = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc${M}x${ABC[N:N+4]}), vqmovn_s32(vacc${M}x${ABC[N+4:N+8]})), voutput_zero_point);
+
+    $for M in range(MR):
+      $for N in range(0, NR, 16):
+        $if N + 8 < NR:
+          int8x16_t vout${M}x${ABC[N:N+16]} = vcombine_s8(vqmovn_s16(vacc${M}x${ABC[N:N+8]}), vqmovn_s16(vacc${M}x${ABC[N+8:N+16]}));
+        $elif M % 2 == 1:
+          int8x16_t vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vcombine_s8(vqmovn_s16(vacc${M-1}x${ABC[N:N+8]}), vqmovn_s16(vacc${M}x${ABC[N:N+8]}));
+        $elif M + 1 == MR:
+          int8x8_t vout${M}x${ABC[N:N+8]} = vqmovn_s16(vacc${M}x${ABC[N:N+8]});
+#endif
+    $if NR == 8 and MR == 1:
+      const int8x8_t voutput_min = vld1_dup_s8(&params->neon.output_min);
+      const int8x8_t voutput_max = vld1_dup_s8(&params->neon.output_max);
+    $else:
+      const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+      const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    $for M in range(MR):
+      $for N in range(0, NR, 16):
+        $if N + 8 < NR:
+          vout${M}x${ABC[N:N+16]} = vmaxq_s8(vout${M}x${ABC[N:N+16]}, voutput_min);
+        $elif M % 2 == 1:
+          vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vmaxq_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}, voutput_min);
+        $elif M + 1 == MR:
+          $if NR == 8 and MR == 1:
+            vout${M}x${ABC[N:N+8]} = vmax_s8(vout${M}x${ABC[N:N+8]}, voutput_min);
+          $else:
+            vout${M}x${ABC[N:N+8]} = vmax_s8(vout${M}x${ABC[N:N+8]}, vget_low_s8(voutput_min));
+
+    $for M in range(MR):
+      $for N in range(0, NR, 16):
+        $if N + 8 < NR:
+          vout${M}x${ABC[N:N+16]} = vminq_s8(vout${M}x${ABC[N:N+16]}, voutput_max);
+        $elif M % 2 == 1:
+          vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vminq_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}, voutput_max);
+        $elif M + 1 == MR:
+          $if NR == 8 and MR == 1:
+            vout${M}x${ABC[N:N+8]} = vmin_s8(vout${M}x${ABC[N:N+8]}, voutput_max);
+          $else:
+            vout${M}x${ABC[N:N+8]} = vmin_s8(vout${M}x${ABC[N:N+8]}, vget_low_s8(voutput_max));
+
+    if (nc >= ${NR}) {
+      $for M in range(MR):
+        $for N in range(0, NR, 16):
+          $if N + 8 < NR:
+            vst1q_s8(c${M} + ${N}, vout${M}x${ABC[N:N+16]});
+          $elif M % 2 == 1:
+            vst1_s8(c${M-1} + ${N}, vget_low_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}));
+            vst1_s8(c${M} + ${N}, vget_high_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}));
+          $elif M + 1 == MR:
+            vst1_s8(c${M} + ${N}, vout${M}x${ABC[N:N+8]});
+
+      $for M in range(MR):
+        c${M} = (int8_t*) ((uintptr_t) c${M} + cn_stride);
+
+      $for M in range(MR):
+        a${M} = (const int8_t*) ((uintptr_t) a${M} - kc);
+
+      nc -= ${NR};
+    } else {
+      $if NR == 16:
+        $for M in range(MR):
+          $if M % 2 == 1:
+            int8x16_t vout${M-1}x01234567_${M}x01234567 = vcombine_s8(vget_low_s8(vout${M-1}x0123456789ABCDEF), vget_low_s8(vout${M}x0123456789ABCDEF));
+          $elif M + 1 == MR:
+            int8x8_t vout${M}x01234567 = vget_low_s8(vout${M}x0123456789ABCDEF);
+        if (nc & 8) {
+          $for M in range(MR):
+            $if M % 2 == 1:
+              vst1_s8(c${M-1}, vget_low_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]})); c${M-1} += 8;
+              vst1_s8(c${M}, vget_high_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]})); c${M} += 8;
+            $elif M + 1 == MR:
+              vst1_s8(c${M}, vout${M}x${ABC[N:N+8]}); c${M} += 8;
+          $for M in range(MR):
+            $if M % 2 == 1:
+              vout${M-1}x01234567_${M}x01234567 = vcombine_s8(vget_high_s8(vout${M-1}x0123456789ABCDEF), vget_high_s8(vout${M}x0123456789ABCDEF));
+            $elif M + 1 == MR:
+              vout${M}x01234567 = vget_high_s8(vout${M}x0123456789ABCDEF);
+        }
+      if (nc & 4) {
+        $for M in range(MR):
+          $if M % 2 == 1:
+            vst1q_lane_u32(__builtin_assume_aligned(c${M-1}, 1), vreinterpretq_u32_s8(vout${M-1}x01234567_${M}x01234567), 0); c${M-1} += 4;
+            vst1q_lane_u32(__builtin_assume_aligned(c${M}, 1), vreinterpretq_u32_s8(vout${M-1}x01234567_${M}x01234567), 2); c${M} += 4;
+          $elif M + 1 == MR:
+            vst1_lane_u32(__builtin_assume_aligned(c${M}, 1), vreinterpret_u32_s8(vout${M}x01234567), 0); c${M} += 4;
+        $for M in range(MR):
+          $if M % 2 == 1:
+            vout${M-1}x01234567_${M}x01234567 = vextq_s8(vout${M-1}x01234567_${M}x01234567, vout${M-1}x01234567_${M}x01234567, 4);
+          $elif M + 1 == MR:
+            vout${M}x01234567 = vext_s8(vout${M}x01234567, vout${M}x01234567, 4);
+      }
+      if (nc & 2) {
+        $for M in range(MR):
+          $if M % 2 == 1:
+            vst1q_lane_u16(__builtin_assume_aligned(c${M-1}, 1), vreinterpretq_u16_s8(vout${M-1}x01234567_${M}x01234567), 0); c${M-1} += 2;
+            vst1q_lane_u16(__builtin_assume_aligned(c${M}, 1), vreinterpretq_u16_s8(vout${M-1}x01234567_${M}x01234567), 4); c${M} += 2;
+          $elif M + 1 == MR:
+            vst1_lane_u16(__builtin_assume_aligned(c${M}, 1), vreinterpret_u16_s8(vout${M}x01234567), 0); c${M} += 2;
+        $for M in range(MR):
+          $if M % 2 == 1:
+            vout${M-1}x01234567_${M}x01234567 = vextq_s8(vout${M-1}x01234567_${M}x01234567, vout${M-1}x01234567_${M}x01234567, 2);
+          $elif M + 1 == MR:
+            vout${M}x01234567 = vext_s8(vout${M}x01234567, vout${M}x01234567, 2);
+      }
+      if (nc & 1) {
+        $for M in range(MR):
+          $if M % 2 == 1:
+            vst1q_lane_s8(c${M-1}, vout${M-1}x01234567_${M}x01234567, 0);
+            vst1q_lane_s8(c${M}, vout${M-1}x01234567_${M}x01234567, 8);
+          $elif M + 1 == MR:
+            vst1_lane_s8(c${M}, vout${M}x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-igemm/2x8c16-aarch64-neon-mlal-padal.S b/src/qs8-igemm/2x8c16-aarch64-neon-mlal-padal.S
new file mode 100644
index 0000000..5771fc8
--- /dev/null
+++ b/src/qs8-igemm/2x8c16-aarch64-neon-mlal-padal.S
@@ -0,0 +1,239 @@
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <xnnpack/assembly.h>
+
+# void xnn_qs8_igemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal(
+#     size_t mr,                 x0
+#     size_t nc,                 x1
+#     size_t kc,                 x2 / x0
+#     size_t ks,                 x3 / x9
+#     const int8_t**restrict a,  x4
+#     const int8_t* restrict w,  x5
+#     int8_t* restrict c,        x6
+#     size_t cm_stride,          x7
+#     size_t cn_stride,                  [sp] -> x10
+#     size_t a_offset,                   [sp + 8] -> x11
+#     const int8_t* zero,                [sp + 16] -> x12
+#     const union xnn_qs8_gemm_params params [sp + 24] -> x8
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0 x13  v0
+# A1 x15  v1
+# B   x5  v4  v5  v6  v7
+# C0  x6 v16 v18 v20 v22 v24 v26 v28 v30
+# C1  x7 v17 v19 v21 v23 v25 v27 v29 v31
+# temp0   v2 v10 v12 v14
+# temp1   v3 v11 v13 v15
+# unused  v8 v9
+
+BEGIN_FUNCTION xnn_qs8_igemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal
+
+        # Clamp C pointers
+        LDP     x10, x11, [sp]        // Load cn_stride, a_offset
+        CMP     x0, 2                 // if mr < 2
+        LDP     x12, x8, [sp, 16]     // Load zero, params pointer
+        ADD     x7, x6, x7            // c1 = c0 + cm_stride
+        STP     d10, d11, [sp, -48]!
+        ADD     x2, x2, 15            // kc = (kc + 15) & ~15
+        STP     d12, d13, [sp, 16]
+        CSEL    x7, x6, x7, LO        //   c1 = c0
+        STP     d14, d15, [sp, 32]
+        BIC     x2, x2, 15
+
+        .p2align 3
+0:
+        # Load initial bias from w into accumulators
+        LDP     s16, s18, [x5], 8
+        MOV     v17.4s, v16.4s
+        MOV     v19.4s, v18.4s
+        LDP     s20, s22, [x5], 8
+        MOV     v21.4s, v20.4s
+        MOV     v23.4s, v22.4s
+        LDP     s24, s26, [x5], 8
+        MOV     v25.4s, v24.4s
+        MOV     v27.4s, v26.4s
+        LDP     s28, s30, [x5], 8
+        MOV     v29.4s, v28.4s
+        MOV     v31.4s, v30.4s
+        MOV     x9, x3  // p = ks
+
+        .p2align 3
+1:
+        # Load next 2 A pointers
+        LDP     x13, x15, [x4], 16
+
+        CMP     x13, x12           // if a0 == zero
+        ADD     x13, x13, x11      // a0 += a_offset
+        CSEL    x13, x12, x13, EQ  //   a0 = zero, else += a0 + a_offset
+        CMP     x15, x12           // if a1 == zero
+        ADD     x15, x15, x11      // a1 += a_offset
+        CSEL    x15, x12, x15, EQ  //   a1 = zero, else += a1 + a_offset
+
+        MOV     x0, x2             // k = kc
+
+        # Main loop - 16 bytes of A
+        .p2align 3
+2:
+        LDR     q0, [x13], 16
+        LDP     q4, q5, [x5]
+        LDR     q1, [x15], 16
+        LDP     q6, q7, [x5, 32]
+        SMULL    v2.8h, v4.8b, v0.8b
+        SMULL    v3.8h, v4.8b, v1.8b
+        SMULL   v10.8h, v5.8b, v0.8b
+        SMULL   v11.8h, v5.8b, v1.8b
+        SMLAL2   v2.8h, v4.16b, v0.16b
+        SMLAL2   v3.8h, v4.16b, v1.16b
+        SMLAL2  v10.8h, v5.16b, v0.16b
+        SMLAL2  v11.8h, v5.16b, v1.16b
+        SMULL   v12.8h, v6.8b, v0.8b
+        SADALP  v16.4s,  v2.8h
+        SMULL   v13.8h, v6.8b, v1.8b
+        SADALP  v17.4s,  v3.8h
+        SMULL   v14.8h, v7.8b, v0.8b
+        SADALP  v18.4s, v10.8h
+        SMULL   v15.8h, v7.8b, v1.8b
+        SADALP  v19.4s, v11.8h
+        LDP     q4, q5, [x5, 64]
+        SMLAL2  v12.8h, v6.16b, v0.16b
+        SMLAL2  v13.8h, v6.16b, v1.16b
+        SMLAL2  v14.8h, v7.16b, v0.16b
+        SMLAL2  v15.8h, v7.16b, v1.16b
+        SMULL    v2.8h, v4.8b, v0.8b
+        SADALP  v20.4s, v12.8h
+        SMULL    v3.8h, v4.8b, v1.8b
+        SADALP  v21.4s, v13.8h
+        SMULL   v10.8h, v5.8b, v0.8b
+        SADALP  v22.4s, v14.8h
+        SMULL   v11.8h, v5.8b, v1.8b
+        SADALP  v23.4s, v15.8h
+        LDP     q6, q7, [x5, 96]
+
+        SMLAL2   v2.8h, v4.16b, v0.16b
+        SMLAL2   v3.8h, v4.16b, v1.16b
+        SMLAL2  v10.8h, v5.16b, v0.16b
+        SMLAL2  v11.8h, v5.16b, v1.16b
+        ADD     x5, x5, 128
+        SMULL   v12.8h, v6.8b, v0.8b
+        SADALP  v24.4s,  v2.8h
+        SMULL   v13.8h, v6.8b, v1.8b
+        SADALP  v25.4s,  v3.8h
+        SMULL   v14.8h, v7.8b, v0.8b
+        SADALP  v26.4s, v10.8h
+        SMULL   v15.8h, v7.8b, v1.8b
+        SADALP  v27.4s, v11.8h
+        SUBS    x0, x0, 16
+        SMLAL2  v12.8h, v6.16b, v0.16b
+        SMLAL2  v13.8h, v6.16b, v1.16b
+        SMLAL2  v14.8h, v7.16b, v0.16b
+        SMLAL2  v15.8h, v7.16b, v1.16b
+        SADALP  v28.4s, v12.8h
+        SADALP  v29.4s, v13.8h
+        SADALP  v30.4s, v14.8h
+        SADALP  v31.4s, v15.8h
+        B.HI    2b
+
+        # ks loop
+        SUBS    x9, x9, 16  // ks -= MR * sizeof(int8_t*)
+        B.HI    1b
+
+        # Add columns
+        ADDP    v16.4s, v16.4s, v18.4s
+        ADDP    v20.4s, v20.4s, v22.4s
+        LD1R    {v4.4s}, [x8], 4
+        ADDP    v24.4s, v24.4s, v26.4s
+        ADDP    v28.4s, v28.4s, v30.4s
+        LD1R    {v7.4s}, [x8], 4
+        ADDP    v17.4s, v17.4s, v19.4s
+        ADDP    v21.4s, v21.4s, v23.4s
+        ADDP    v25.4s, v25.4s, v27.4s
+        ADDP    v29.4s, v29.4s, v31.4s
+        ADDP    v0.4s, v16.4s, v20.4s
+        ADDP    v1.4s, v24.4s, v28.4s
+        ADDP    v2.4s, v17.4s, v21.4s
+        ADDP    v3.4s, v25.4s, v29.4s
+
+        # Apply params - scale, shift, bias and clamp
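+        # The params block at x8 is read in order: multiplier (int32), right_shift (int32),
+        # output_zero_point (int16), output_min (int8), output_max (int8).  SUB x8, x8, 11
+        # below rewinds the 11 bytes consumed with post-increment so the next tile of nc
+        # re-reads the same values.  (Layout inferred from the loads below.)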
+        SQRDMULH        v0.4s, v0.4s, v4.4s
+        SQRDMULH        v1.4s, v1.4s, v4.4s
+        SQRDMULH        v2.4s, v2.4s, v4.4s
+        SQRDMULH        v3.4s, v3.4s, v4.4s
+        CMEQ    v4.4s, v7.4s, 0
+        LD1R    {v5.8h}, [x8], 2
+        BIC      v6.16b, v0.16b, v4.16b
+        BIC     v16.16b, v1.16b, v4.16b
+        BIC     v17.16b, v2.16b, v4.16b
+        BIC     v4.16b,  v3.16b, v4.16b
+        SSRA    v0.4s,  v6.4s, 31
+        SSRA    v1.4s, v16.4s, 31
+        SSRA    v2.4s, v17.4s, 31
+        SSRA    v3.4s,  v4.4s, 31
+        SRSHL   v0.4s, v0.4s, v7.4s
+        SRSHL   v1.4s, v1.4s, v7.4s
+        SRSHL   v2.4s, v2.4s, v7.4s
+        SRSHL   v3.4s, v3.4s, v7.4s
+        SQXTN   v0.4h, v0.4s
+        SQXTN   v2.4h, v2.4s
+        SQXTN2  v0.8h, v1.4s
+        SQXTN2  v2.8h, v3.4s
+        SUBS    x1, x1, 8
+        SQADD   v0.8h, v0.8h, v5.8h
+        SQADD   v1.8h, v2.8h, v5.8h
+        SQXTN   v0.8b, v0.8h
+        SQXTN2  v0.16b, v1.8h
+        LD1R    {v1.16b}, [x8], 1
+        LD1R    {v2.16b}, [x8]
+        SMAX    v0.16b, v0.16b, v1.16b
+        SUB     x8, x8, 11       // rewind params pointer
+        SMIN    v0.16b, v0.16b, v2.16b
+        B.LO    3f
+
+        # Store full 2 x 8
+        ST1     {v0.d}[1], [x7], x10
+        SUB     x4, x4, x3  // a -= ks
+        ST1     {v0.8b}, [x6], x10
+
+        # nc loop
+        B.HI    0b
+
+        # Restore d10-d15 from stack
+        LDP     d14, d15, [sp, 32]
+        LDP     d12, d13, [sp, 16]
+        LDP     d10, d11, [sp], 48
+        RET
+
+        # Store odd width
+        .p2align 3
+3:
+        TBZ     x1, 2, 4f
+        ST1     {v0.s}[2], [x7], 4
+        STR     s0, [x6], 4
+        EXT     v0.16b, v0.16b, v0.16b, 4
+
+4:
+        TBZ     x1, 1, 5f
+        ST1     {v0.h}[4], [x7], 2
+        ST1     {v0.h}[0], [x6], 2
+        EXT     v0.16b, v0.16b, v0.16b, 2
+5:
+        TBZ     x1, 0, 6f
+        ST1     {v0.b}[8], [x7]
+        ST1     {v0.b}[0], [x6]
+6:
+        # Restore d10-d15 from stack
+        LDP     d14, d15, [sp, 32]
+        LDP     d12, d13, [sp, 16]
+        LDP     d10, d11, [sp], 48
+        RET
+
+END_FUNCTION xnn_qs8_igemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
+
diff --git a/src/qs8-igemm/2x8c8-aarch64-neon-mlal-padal.S b/src/qs8-igemm/2x8c8-aarch64-neon-mlal-padal.S
new file mode 100644
index 0000000..273021b
--- /dev/null
+++ b/src/qs8-igemm/2x8c8-aarch64-neon-mlal-padal.S
@@ -0,0 +1,302 @@
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <xnnpack/assembly.h>
+
+# void xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal(
+#     size_t mr,                 x0
+#     size_t nc,                 x1
+#     size_t kc,                 x2 / x0
+#     size_t ks,                 x3 / x9
+#     const int8_t**restrict a,  x4
+#     const int8_t* restrict w,  x5
+#     int8_t* restrict c,        x6
+#     size_t cm_stride,          x7
+#     size_t cn_stride,                  [sp] -> x10
+#     size_t a_offset,                   [sp + 8] -> x11
+#     const int8_t* zero,                [sp + 16] -> x12
+#     const union xnn_qs8_gemm_params params [sp + 24] -> x8
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0 x13  v0
+# A1 x15  v1
+# B   x5  v4  v5  v6  v7
+# C0  x6 v16 v18 v20 v22 v24 v26 v28 v30
+# C1  x7 v17 v19 v21 v23 v25 v27 v29 v31
+# temp0   v2 v10 v12 v14
+# temp1   v3 v11 v13 v15
+
+BEGIN_FUNCTION xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal
+
+        # Clamp C pointers
+        LDP     x10, x11, [sp]        // Load cn_stride, a_offset
+        CMP     x0, 2                 // if mr < 2
+        LDP     x12, x8, [sp, 16]     // Load zero, params pointer
+        ADD     x7, x6, x7            // c1 = c0 + cm_stride
+        STP     d8, d9, [sp, -64]!
+        ADD     x2, x2, 7             // kc = (kc + 7) & ~7
+        STP     d10, d11, [sp, 16]
+        CSEL    x7, x6, x7, LO        //   c1 = c0
+        STP     d12, d13, [sp, 32]
+        BIC     x2, x2, 7
+        STP     d14, d15, [sp, 48]
+
+        .p2align 3
+0:
+        # Load initial bias from w into accumulators
+        LDP     s16, s18, [x5], 8
+        MOV     v17.4s, v16.4s
+        MOV     v19.4s, v18.4s
+        LDP     s20, s22, [x5], 8
+        MOV     v21.4s, v20.4s
+        MOV     v23.4s, v22.4s
+        LDP     s24, s26, [x5], 8
+        MOV     v25.4s, v24.4s
+        MOV     v27.4s, v26.4s
+        LDP     s28, s30, [x5], 8
+        MOV     v29.4s, v28.4s
+        MOV     v31.4s, v30.4s
+        MOV     x9, x3  // p = ks
+
+        .p2align 3
+1:
+        # Load next 2 A pointers
+        LDP     x13, x15, [x4], 16
+
+        CMP     x13, x12           // if a0 == zero
+        ADD     x13, x13, x11      // a0 += a_offset
+        CSEL    x13, x12, x13, EQ  //   a0 = zero, else += a0 + a_offset
+        CMP     x15, x12           // if a1 == zero
+        ADD     x15, x15, x11      // a1 += a_offset
+        CSEL    x15, x12, x15, EQ  //   a1 = zero, else += a1 + a_offset
+
+        # Is there at least 16 bytes for main loop?
+        SUBS    x0, x2, 16          // k = kc - 16
+        B.LO    4f
+
+        # Main loop - 16 bytes of A
+        .p2align 3
+2:
+        LDP     d0, d6, [x13], 16
+        LDP     d4, d5, [x5]
+        LDP     d1, d7, [x15], 16
+        LDP     d8, d9, [x5, 64]
+        SMULL    v2.8h, v4.8b, v0.8b
+        SMULL    v3.8h, v4.8b, v1.8b
+        SMULL   v10.8h, v5.8b, v0.8b
+        SMULL   v11.8h, v5.8b, v1.8b
+        LDP     d4, d5, [x5, 16]
+        SMLAL    v2.8h, v8.8b, v6.8b
+        SMLAL    v3.8h, v8.8b, v7.8b
+        SMLAL   v10.8h, v9.8b, v6.8b
+        SMLAL   v11.8h, v9.8b, v7.8b
+
+        LDP     d8, d9, [x5, 80]
+        SMULL   v12.8h, v4.8b, v0.8b
+        SADALP  v16.4s,  v2.8h
+        SMULL   v13.8h, v4.8b, v1.8b
+        SADALP  v17.4s,  v3.8h
+        SMULL   v14.8h, v5.8b, v0.8b
+        SADALP  v18.4s, v10.8h
+        SMULL   v15.8h, v5.8b, v1.8b
+        SADALP  v19.4s, v11.8h
+        LDP     d4, d5, [x5, 32]
+        SMLAL   v12.8h, v8.8b, v6.8b
+        SMLAL   v13.8h, v8.8b, v7.8b
+        SMLAL   v14.8h, v9.8b, v6.8b
+        SMLAL   v15.8h, v9.8b, v7.8b
+
+        LDP     d8, d9, [x5, 96]
+        SMULL    v2.8h, v4.8b, v0.8b
+        SADALP  v20.4s, v12.8h
+        SMULL    v3.8h, v4.8b, v1.8b
+        SADALP  v21.4s, v13.8h
+        SMULL   v10.8h, v5.8b, v0.8b
+        SADALP  v22.4s, v14.8h
+        SMULL   v11.8h, v5.8b, v1.8b
+        SADALP  v23.4s, v15.8h
+        LDP     d4, d5, [x5, 48]
+        SMLAL    v2.8h, v8.8b, v6.8b
+        SMLAL    v3.8h, v8.8b, v7.8b
+        SMLAL   v10.8h, v9.8b, v6.8b
+        SMLAL   v11.8h, v9.8b, v7.8b
+
+        LDP     d8, d9, [x5, 112]
+        SMULL   v12.8h, v4.8b, v0.8b
+        SADALP  v24.4s,  v2.8h
+        SMULL   v13.8h, v4.8b, v1.8b
+        SADALP  v25.4s,  v3.8h
+        SMULL   v14.8h, v5.8b, v0.8b
+        SADALP  v26.4s, v10.8h
+        SMULL   v15.8h, v5.8b, v1.8b
+        SADALP  v27.4s, v11.8h
+        SMLAL   v12.8h, v8.8b, v6.8b
+        SMLAL   v13.8h, v8.8b, v7.8b
+        SMLAL   v14.8h, v9.8b, v6.8b
+        SMLAL   v15.8h, v9.8b, v7.8b
+        ADD     x5, x5, 128
+
+        SADALP  v28.4s, v12.8h
+        SADALP  v29.4s, v13.8h
+        SUBS    x0, x0, 16
+        SADALP  v30.4s, v14.8h
+        SADALP  v31.4s, v15.8h
+        B.HS    2b
+
+        # Is there a remainder? - 8 bytes of A
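+        # (kc is a multiple of 8 here, so after the SUBS underflows, bit 3 of x0 is set
+        # exactly when 8 bytes of A remain.)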
+        TBNZ    x0, 3, 4f
+
+        # ks loop
+        SUBS    x9, x9, 16  // ks -= MR * sizeof(int8_t*)
+        B.HI    1b
+
+3:
+        # Add columns
+        ADDP    v16.4s, v16.4s, v18.4s
+        ADDP    v20.4s, v20.4s, v22.4s
+        LD1R    {v4.4s}, [x8], 4
+        ADDP    v24.4s, v24.4s, v26.4s
+        ADDP    v28.4s, v28.4s, v30.4s
+        LD1R    {v7.4s}, [x8], 4
+        ADDP    v17.4s, v17.4s, v19.4s
+        ADDP    v21.4s, v21.4s, v23.4s
+        ADDP    v25.4s, v25.4s, v27.4s
+        ADDP    v29.4s, v29.4s, v31.4s
+        ADDP    v0.4s, v16.4s, v20.4s
+        ADDP    v1.4s, v24.4s, v28.4s
+        ADDP    v2.4s, v17.4s, v21.4s
+        ADDP    v3.4s, v25.4s, v29.4s
+
+        # Apply params - scale, shift, bias and clamp
+        SQRDMULH        v0.4s, v0.4s, v4.4s
+        SQRDMULH        v1.4s, v1.4s, v4.4s
+        SQRDMULH        v2.4s, v2.4s, v4.4s
+        SQRDMULH        v3.4s, v3.4s, v4.4s
+        CMEQ    v4.4s, v7.4s, 0
+        LD1R    {v5.8h}, [x8], 2
+        BIC      v6.16b, v0.16b, v4.16b
+        BIC     v16.16b, v1.16b, v4.16b
+        BIC     v17.16b, v2.16b, v4.16b
+        BIC     v4.16b,  v3.16b, v4.16b
+        SSRA    v0.4s,  v6.4s, 31
+        SSRA    v1.4s, v16.4s, 31
+        SSRA    v2.4s, v17.4s, 31
+        SSRA    v3.4s,  v4.4s, 31
+        SRSHL   v0.4s, v0.4s, v7.4s
+        SRSHL   v1.4s, v1.4s, v7.4s
+        SRSHL   v2.4s, v2.4s, v7.4s
+        SRSHL   v3.4s, v3.4s, v7.4s
+        SQXTN   v0.4h, v0.4s
+        SQXTN   v2.4h, v2.4s
+        SQXTN2  v0.8h, v1.4s
+        SQXTN2  v2.8h, v3.4s
+        SUBS    x1, x1, 8
+        SQADD   v0.8h, v0.8h, v5.8h
+        SQADD   v1.8h, v2.8h, v5.8h
+        SQXTN   v0.8b, v0.8h
+        SQXTN2  v0.16b, v1.8h
+        LD1R    {v1.16b}, [x8], 1
+        LD1R    {v2.16b}, [x8]
+        SMAX    v0.16b, v0.16b, v1.16b
+        SUB     x8, x8, 11       // rewind params pointer
+        SMIN    v0.16b, v0.16b, v2.16b
+        B.LO    5f
+
+        # Store full 2 x 8
+        ST1     {v0.d}[1], [x7], x10
+        ST1     {v0.8b}, [x6], x10
+
+        SUB     x4, x4, x3  // a -= ks
+
+        # nc loop
+        B.HI    0b
+
+        # Restore d8-d15 from stack
+        LDP     d14, d15, [sp, 48]
+        LDP     d12, d13, [sp, 32]
+        LDP     d10, d11, [sp, 16]
+        LDP     d8, d9, [sp], 64
+        RET
+
+        # Remainder - 8 bytes of A
+        .p2align 3
+4:
+        LDR     d0, [x13]
+        LDP     d4, d5, [x5]
+        LDR     d1, [x15]
+        LDP     d6, d7, [x5, 16]
+        SMULL    v2.8h, v4.8b, v0.8b
+        SMULL    v3.8h, v4.8b, v1.8b
+        SMULL   v10.8h, v5.8b, v0.8b
+        SMULL   v11.8h, v5.8b, v1.8b
+        SMULL   v12.8h, v6.8b, v0.8b
+        SADALP  v16.4s,  v2.8h
+        SMULL   v13.8h, v6.8b, v1.8b
+        SADALP  v17.4s,  v3.8h
+        SMULL   v14.8h, v7.8b, v0.8b
+        SADALP  v18.4s, v10.8h
+        SMULL   v15.8h, v7.8b, v1.8b
+        SADALP  v19.4s, v11.8h
+        LDP     d4, d5, [x5, 32]
+        SMULL    v2.8h, v4.8b, v0.8b
+        SADALP  v20.4s, v12.8h
+        SMULL    v3.8h, v4.8b, v1.8b
+        SADALP  v21.4s, v13.8h
+        SMULL   v10.8h, v5.8b, v0.8b
+        SADALP  v22.4s, v14.8h
+        SMULL   v11.8h, v5.8b, v1.8b
+        SADALP  v23.4s, v15.8h
+        LDP     d6, d7, [x5, 48]
+        SMULL   v12.8h, v6.8b, v0.8b
+        SADALP  v24.4s,  v2.8h
+        SMULL   v13.8h, v6.8b, v1.8b
+        SADALP  v25.4s,  v3.8h
+        SMULL   v14.8h, v7.8b, v0.8b
+        SADALP  v26.4s, v10.8h
+        SMULL   v15.8h, v7.8b, v1.8b
+        SADALP  v27.4s, v11.8h
+        ADD     x5, x5, 64
+        SADALP  v28.4s, v12.8h
+        SADALP  v29.4s, v13.8h
+        SADALP  v30.4s, v14.8h
+        SADALP  v31.4s, v15.8h
+
+        # ks loop
+        SUBS    x9, x9, 16  // ks -= MR * sizeof(int8_t*)
+        B.HI    1b
+        B       3b
+
+        # Store odd width
+        .p2align 3
+5:
+        TBZ     x1, 2, 6f
+        ST1     {v0.s}[2], [x7], 4
+        STR     s0, [x6], 4
+        EXT     v0.16b, v0.16b, v0.16b, 4
+
+6:
+        TBZ     x1, 1, 7f
+        ST1     {v0.h}[4], [x7], 2
+        ST1     {v0.h}[0], [x6], 2
+        EXT     v0.16b, v0.16b, v0.16b, 2
+7:
+        TBZ     x1, 0, 8f
+        ST1     {v0.b}[8], [x7]
+        ST1     {v0.b}[0], [x6]
+8:
+        # Restore d8-d15 from stack
+        LDP     d14, d15, [sp, 48]
+        LDP     d12, d13, [sp, 32]
+        LDP     d10, d11, [sp, 16]
+        LDP     d8, d9, [sp], 64
+        RET
+
+END_FUNCTION xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/qs8-igemm/4x16c4-aarch64-neondot-cortex-a55.S b/src/qs8-igemm/4x16c4-aarch64-neondot-cortex-a55.S
new file mode 100644
index 0000000..cae3fdf
--- /dev/null
+++ b/src/qs8-igemm/4x16c4-aarch64-neondot-cortex-a55.S
@@ -0,0 +1,698 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <xnnpack/assembly.h>
+
+# void xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55(
+#     size_t mr,                 x0
+#     size_t nc,                 x1
+#     size_t kc,                 x2 / x0
+#     size_t ks,                 x3 / x9
+#     const int8_t**restrict a,  x4
+#     const int8_t* restrict w,  x5
+#     int8_t* restrict c,        x6
+#     size_t cm_stride,          x7
+#     size_t cn_stride,                  [sp] -> x10
+#     size_t a_offset,                   [sp + 8] -> x11
+#     const int8_t* zero,                [sp + 16] -> x12
+#     const union xnn_qs8_gemm_params params [sp + 24] -> x8
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0  x20  v0  v4
+# A1  x15  v1  v5
+# A2  x13  v2  v6
+# A3  x21  v3  v7
+# B    x5  v8  v9 v10 v11
+# C0   x6 v16 v20 v24 v28
+# C1  x16 v17 v21 v25 v29
+# C2  x17 v18 v22 v26 v30
+# C3   x7 v19 v23 v27 v31
+# unused v12 v13 v14 v15
+
+# x14 temp for A55 loads.
+
+BEGIN_FUNCTION xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55
+
+        # Clamp C pointers
+        CMP      x0, 2                // if mr < 2
+        LDP     x10, x11, [sp]        // Load cn_stride, a_offset
+        ADD     x16, x6, x7           // c1 = c0 + cm_stride
+        LDP     x12, x8, [sp, 16]     // Load zero, params pointer
+        CSEL    x16, x6,  x16, LO     //   c1 = c0
+        STP     x20, x21, [sp, -48]!  // Save x20-x21 on stack
+        ADD      x2, x2, 3            // kc = (kc + 3) & ~3
+        STP      d8,  d9, [sp, 16]    // Save d8-d11 on stack
+
+        ADD     x17, x16, x7          // c2 = c1 + cm_stride
+        STP     d10, d11, [sp, 32]
+                                      // if mr <= 2
+        CSEL    x17, x16, x17, LS     //   c2 = c1
+        BIC      x2, x2, 3
+
+        CMP      x0, 4                // if mr < 4
+        ADD      x7,  x17, x7         // c3 = c2 + cm_stride
+        CSEL     x7,  x17, x7, LO     //   c3 = c2
+
+        .p2align 3
+0:
+        # Load initial bias from w into accumulators
+        LDP     q16, q20, [x5], 32
+        MOV     v17.16b, v16.16b
+        MOV     v18.16b, v16.16b
+        LDP     q24, q28, [x5], 32
+        MOV     v19.16b, v16.16b
+        MOV     v21.16b, v20.16b
+        MOV     v22.16b, v20.16b
+        MOV     v23.16b, v20.16b
+        MOV     v25.16b, v24.16b
+        MOV     v26.16b, v24.16b
+        MOV     v27.16b, v24.16b
+        MOV     v29.16b, v28.16b
+        MOV     v30.16b, v28.16b
+        MOV     v31.16b, v28.16b
+        MOV     x9, x3  // p = ks
+
+        .p2align 3
+1:
+        # Load next 4 A pointers
+        LDP     x20, x15, [x4], 16
+        LDP     x13, x21, [x4], 16
+
+        CMP     x20, x12           // if a0 == zero
+        ADD     x20, x20, x11      // a0 += a_offset
+        CSEL    x20, x12, x20, EQ  //   a0 = zero, else += a0 + a_offset
+        CMP     x15, x12           // if a1 == zero
+        ADD     x15, x15, x11      // a1 += a_offset
+        CSEL    x15, x12, x15, EQ  //   a1 = zero, else += a1 + a_offset
+        CMP     x13, x12           // if a2 == zero
+        ADD     x13, x13, x11      // a2 += a_offset
+        CSEL    x13, x12, x13, EQ  //   a2 = zero, else += a2 + a_offset
+        CMP     x21, x12           // if a3 == zero
+        ADD     x21, x21, x11      // a3 += a_offset
+        CSEL    x21, x12, x21, EQ  //   a3 = zero, else += a3 + a_offset
+
+        # Is there at least 16 bytes for prologue/epilogue?
+        SUBS    x0, x2, 16         // k = kc - 16
+        B.LO    5f
+
+        # prologue - read A and B values for block 0 and 1
+        LDR      d0, [x20], 8
+        LDR      q8,  [x5], 16
+        LDR      d1, [x15], 8
+        LDR      d2, [x13], 8
+        LDR      d3, [x21], 8
+        SUBS     x0, x0, 16         // is there 16 for main loop?
+        LDR      d9,  [x5], 8
+        LDR     x14,  [x5], 8
+        # Is there at least 16 bytes for main loop?
+        B.LO    3f
+
+        # Main loop - 16 bytes of A in 4 groups.
+        # 4 rows of 4 vectors wide = 16 SDOT instructions per group of 4 channels.
+        # 4 LD64 for A.
+        # 4 LD128 for W, each assembled as 2 LD64 + INS.
+        # For every 4 SDOTs: 1 LD64 for A, 2 LD64 + INS for W.
+
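+        # Each 128-bit vector of W is assembled from two 64-bit loads: LDR d<n> fills the low
+        # half and LDR x14 + INS v<n>.d[1] fills the high half.  The smaller loads are
+        # presumably used because they pair better with SDOT in Cortex-A55's dual-issue pipeline.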
+        .p2align 3
+2:
+        // BLOCK 0
+        SDOT    v16.4s,  v8.16b, v0.4b[0]
+        LDR     d10,  [x5], 8
+        SDOT    v17.4s,  v8.16b, v1.4b[0]
+        INS      v9.d[1], x14
+        SDOT    v18.4s,  v8.16b, v2.4b[0]
+        LDR     x14,  [x5], 8
+        SDOT    v19.4s,  v8.16b, v3.4b[0]
+        LDR      d4,  [x20], 8
+
+        // BLOCK 1
+        SDOT    v20.4s,  v9.16b, v0.4b[0]
+        LDR     d11,  [x5], 8
+        SDOT    v21.4s,  v9.16b, v1.4b[0]
+        INS     v10.d[1], x14
+        SDOT    v22.4s,  v9.16b, v2.4b[0]
+        LDR     x14,  [x5], 8
+        SDOT    v23.4s,  v9.16b, v3.4b[0]
+        LDR      d5, [x15], 8
+
+        // BLOCK 2
+        SDOT    v24.4s, v10.16b, v0.4b[0]
+        LDR      d8,  [x5], 8
+        SDOT    v25.4s, v10.16b, v1.4b[0]
+        INS     v11.d[1], x14
+        SDOT    v26.4s, v10.16b, v2.4b[0]
+        LDR     x14,  [x5], 8
+        SDOT    v27.4s, v10.16b, v3.4b[0]
+        LDR      d6, [x13], 8
+
+        // BLOCK 3
+        SDOT    v28.4s, v11.16b, v0.4b[0]
+        LDR      d9,  [x5], 8
+        SDOT    v29.4s, v11.16b, v1.4b[0]
+        INS      v8.d[1], x14
+        SDOT    v30.4s, v11.16b, v2.4b[0]
+        LDR     x14,  [x5], 8
+        SDOT    v31.4s, v11.16b, v3.4b[0]
+        LDR      d7,  [x21], 8
+
+        // BLOCK 0
+        SDOT    v16.4s,  v8.16b, v0.4b[1]
+        LDR     d10,  [x5], 8
+        SDOT    v17.4s,  v8.16b, v1.4b[1]
+        INS      v9.d[1], x14
+        SDOT    v18.4s,  v8.16b, v2.4b[1]
+        LDR     x14,  [x5], 8
+        SDOT    v19.4s,  v8.16b, v3.4b[1]
+
+        // BLOCK 1
+        SDOT    v20.4s,  v9.16b, v0.4b[1]
+        LDR     d11,  [x5], 8
+        SDOT    v21.4s,  v9.16b, v1.4b[1]
+        INS     v10.d[1], x14
+        SDOT    v22.4s,  v9.16b, v2.4b[1]
+        LDR     x14,  [x5], 8
+        SDOT    v23.4s,  v9.16b, v3.4b[1]
+
+        // BLOCK 2
+        SDOT    v24.4s, v10.16b, v0.4b[1]
+        LDR      d8,  [x5], 8
+        SDOT    v25.4s, v10.16b, v1.4b[1]
+        INS     v11.d[1], x14
+        SDOT    v26.4s, v10.16b, v2.4b[1]
+        LDR     x14,  [x5], 8
+        SDOT    v27.4s, v10.16b, v3.4b[1]
+
+        // BLOCK 3
+        SDOT    v28.4s, v11.16b, v0.4b[1]
+        LDR      d9,  [x5], 8
+        SDOT    v29.4s, v11.16b, v1.4b[1]
+        INS      v8.d[1], x14
+        SDOT    v30.4s, v11.16b, v2.4b[1]
+        LDR     x14,  [x5], 8
+        SDOT    v31.4s, v11.16b, v3.4b[1]
+
+        // BLOCK 0
+        SDOT    v16.4s,  v8.16b, v4.4b[0]
+        LDR     d10,  [x5], 8
+        SDOT    v17.4s,  v8.16b, v5.4b[0]
+        INS      v9.d[1], x14
+        SDOT    v18.4s,  v8.16b, v6.4b[0]
+        LDR     x14,  [x5], 8
+        SDOT    v19.4s,  v8.16b, v7.4b[0]
+        LDR      d0,  [x20], 8
+
+        // BLOCK 1
+        SDOT    v20.4s,  v9.16b, v4.4b[0]
+        LDR     d11,  [x5], 8
+        SDOT    v21.4s,  v9.16b, v5.4b[0]
+        INS     v10.d[1], x14
+        SDOT    v22.4s,  v9.16b, v6.4b[0]
+        LDR     x14,  [x5], 8
+        SDOT    v23.4s,  v9.16b, v7.4b[0]
+        LDR      d1, [x15], 8
+
+        // BLOCK 2
+        SDOT    v24.4s, v10.16b, v4.4b[0]
+        LDR      d8,  [x5], 8
+        SDOT    v25.4s, v10.16b, v5.4b[0]
+        INS     v11.d[1], x14
+        SDOT    v26.4s, v10.16b, v6.4b[0]
+        LDR     x14,  [x5], 8
+        SDOT    v27.4s, v10.16b, v7.4b[0]
+        LDR      d2, [x13], 8
+
+        // BLOCK 3
+        SDOT    v28.4s, v11.16b, v4.4b[0]
+        LDR      d9,  [x5], 8
+        SDOT    v29.4s, v11.16b, v5.4b[0]
+        INS      v8.d[1], x14
+        SDOT    v30.4s, v11.16b, v6.4b[0]
+        LDR     x14,  [x5], 8
+        SDOT    v31.4s, v11.16b, v7.4b[0]
+        LDR      d3,  [x21], 8
+
+        // BLOCK 0
+        SDOT    v16.4s,  v8.16b, v4.4b[1]
+        LDR     d10,  [x5], 8
+        SDOT    v17.4s,  v8.16b, v5.4b[1]
+        INS      v9.d[1], x14
+        SDOT    v18.4s,  v8.16b, v6.4b[1]
+        LDR     x14,  [x5], 8
+        SDOT    v19.4s,  v8.16b, v7.4b[1]
+
+        // BLOCK 1
+        SDOT    v20.4s,  v9.16b, v4.4b[1]
+        LDR     d11,  [x5], 8
+        SDOT    v21.4s,  v9.16b, v5.4b[1]
+        INS     v10.d[1], x14
+        SDOT    v22.4s,  v9.16b, v6.4b[1]
+        LDR     x14,  [x5], 8
+        SDOT    v23.4s,  v9.16b, v7.4b[1]
+
+        // BLOCK 2
+        SDOT    v24.4s, v10.16b, v4.4b[1]
+        LDR      d8,  [x5], 8   // First B values for block 0 and 1
+        SDOT    v25.4s, v10.16b, v5.4b[1]
+        INS     v11.d[1], x14
+        SDOT    v26.4s, v10.16b, v6.4b[1]
+        LDR     x14,  [x5], 8
+        SDOT    v27.4s, v10.16b, v7.4b[1]
+        SUBS    x0, x0, 16
+
+        // BLOCK 3
+        SDOT    v28.4s, v11.16b, v4.4b[1]
+        LDR      d9,  [x5], 8
+        SDOT    v29.4s, v11.16b, v5.4b[1]
+        INS      v8.d[1], x14
+        SDOT    v30.4s, v11.16b, v6.4b[1]
+        LDR     x14,  [x5], 8
+        SDOT    v31.4s, v11.16b, v7.4b[1]
+        B.HS    2b
+
+        # Epilogue.  Same as main loop but no preloads in final group
+3:
+        // BLOCK 0
+        SDOT    v16.4s,  v8.16b, v0.4b[0]
+        LDR     d10,  [x5], 8
+        SDOT    v17.4s,  v8.16b, v1.4b[0]
+        INS      v9.d[1], x14
+        SDOT    v18.4s,  v8.16b, v2.4b[0]
+        LDR     x14,  [x5], 8
+        SDOT    v19.4s,  v8.16b, v3.4b[0]
+        LDR      d4,  [x20], 8
+
+        // BLOCK 1
+        SDOT    v20.4s,  v9.16b, v0.4b[0]
+        LDR     d11,  [x5], 8
+        SDOT    v21.4s,  v9.16b, v1.4b[0]
+        INS     v10.d[1], x14
+        SDOT    v22.4s,  v9.16b, v2.4b[0]
+        LDR     x14,  [x5], 8
+        SDOT    v23.4s,  v9.16b, v3.4b[0]
+        LDR      d5, [x15], 8
+
+        // BLOCK 2
+        SDOT    v24.4s, v10.16b, v0.4b[0]
+        LDR      d8,  [x5], 8
+        SDOT    v25.4s, v10.16b, v1.4b[0]
+        INS     v11.d[1], x14
+        SDOT    v26.4s, v10.16b, v2.4b[0]
+        LDR     x14,  [x5], 8
+        SDOT    v27.4s, v10.16b, v3.4b[0]
+        LDR      d6, [x13], 8
+
+        // BLOCK 3
+        SDOT    v28.4s, v11.16b, v0.4b[0]
+        LDR      d9,  [x5], 8
+        SDOT    v29.4s, v11.16b, v1.4b[0]
+        INS      v8.d[1], x14
+        SDOT    v30.4s, v11.16b, v2.4b[0]
+        LDR     x14,  [x5], 8
+        SDOT    v31.4s, v11.16b, v3.4b[0]
+        LDR      d7,  [x21], 8
+
+        // BLOCK 0
+        SDOT    v16.4s,  v8.16b, v0.4b[1]
+        LDR     d10,  [x5], 8
+        SDOT    v17.4s,  v8.16b, v1.4b[1]
+        INS      v9.d[1], x14
+        SDOT    v18.4s,  v8.16b, v2.4b[1]
+        LDR     x14,  [x5], 8
+        SDOT    v19.4s,  v8.16b, v3.4b[1]
+
+        // BLOCK 1
+        SDOT    v20.4s,  v9.16b, v0.4b[1]
+        LDR     d11,  [x5], 8
+        SDOT    v21.4s,  v9.16b, v1.4b[1]
+        INS     v10.d[1], x14
+        SDOT    v22.4s,  v9.16b, v2.4b[1]
+        LDR     x14,  [x5], 8
+        SDOT    v23.4s,  v9.16b, v3.4b[1]
+
+        // BLOCK 2
+        SDOT    v24.4s, v10.16b, v0.4b[1]
+        LDR      d8,  [x5], 8
+        SDOT    v25.4s, v10.16b, v1.4b[1]
+        INS     v11.d[1], x14
+        SDOT    v26.4s, v10.16b, v2.4b[1]
+        LDR     x14,  [x5], 8
+        SDOT    v27.4s, v10.16b, v3.4b[1]
+
+        // BLOCK 3
+        SDOT    v28.4s, v11.16b, v0.4b[1]
+        LDR      d9,  [x5], 8
+        SDOT    v29.4s, v11.16b, v1.4b[1]
+        INS      v8.d[1], x14
+        SDOT    v30.4s, v11.16b, v2.4b[1]
+        LDR     x14,  [x5], 8
+        SDOT    v31.4s, v11.16b, v3.4b[1]
+
+        // BLOCK 0
+        SDOT    v16.4s,  v8.16b, v4.4b[0]
+        LDR     d10,  [x5], 8
+        SDOT    v17.4s,  v8.16b, v5.4b[0]
+        INS      v9.d[1], x14
+        SDOT    v18.4s,  v8.16b, v6.4b[0]
+        LDR     x14,  [x5], 8
+        SDOT    v19.4s,  v8.16b, v7.4b[0]
+
+        // BLOCK 1
+        SDOT    v20.4s,  v9.16b, v4.4b[0]
+        LDR     d11,  [x5], 8
+        SDOT    v21.4s,  v9.16b, v5.4b[0]
+        INS     v10.d[1], x14
+        SDOT    v22.4s,  v9.16b, v6.4b[0]
+        LDR     x14,  [x5], 8
+        SDOT    v23.4s,  v9.16b, v7.4b[0]
+
+        // BLOCK 2
+        SDOT    v24.4s, v10.16b, v4.4b[0]
+        LDR      d8,  [x5], 8
+        SDOT    v25.4s, v10.16b, v5.4b[0]
+        INS     v11.d[1], x14
+        SDOT    v26.4s, v10.16b, v6.4b[0]
+        LDR     x14,  [x5], 8
+        SDOT    v27.4s, v10.16b, v7.4b[0]
+
+        // BLOCK 3
+        SDOT    v28.4s, v11.16b, v4.4b[0]
+        LDR      d9,  [x5], 8
+        SDOT    v29.4s, v11.16b, v5.4b[0]
+        INS      v8.d[1], x14
+        SDOT    v30.4s, v11.16b, v6.4b[0]
+        LDR     x14,  [x5], 8
+        SDOT    v31.4s, v11.16b, v7.4b[0]
+
+        // BLOCK 0
+        SDOT    v16.4s,  v8.16b, v4.4b[1]
+        LDR     d10,  [x5], 8
+        SDOT    v17.4s,  v8.16b, v5.4b[1]
+        INS      v9.d[1], x14
+        SDOT    v18.4s,  v8.16b, v6.4b[1]
+        LDR     x14,  [x5], 8
+        SDOT    v19.4s,  v8.16b, v7.4b[1]
+
+        // BLOCK 1
+        SDOT    v20.4s,  v9.16b, v4.4b[1]
+        LDR     d11,  [x5], 8
+        SDOT    v21.4s,  v9.16b, v5.4b[1]
+        INS     v10.d[1], x14
+        SDOT    v22.4s,  v9.16b, v6.4b[1]
+        LDR     x14,  [x5], 8
+        SDOT    v23.4s,  v9.16b, v7.4b[1]
+
+        // BLOCK 2
+        SDOT    v24.4s, v10.16b, v4.4b[1]
+        SDOT    v25.4s, v10.16b, v5.4b[1]
+        INS     v11.d[1], x14
+        SDOT    v26.4s, v10.16b, v6.4b[1]
+        SDOT    v27.4s, v10.16b, v7.4b[1]
+        AND     x0, x2, 15        // kc remainder 0 to 12
+
+        // BLOCK 3
+        SDOT    v28.4s, v11.16b, v4.4b[1]
+        SDOT    v29.4s, v11.16b, v5.4b[1]
+        SDOT    v30.4s, v11.16b, v6.4b[1]
+        SDOT    v31.4s, v11.16b, v7.4b[1]
+
+        # Is there a remainder? - 4 to 12 bytes of A
+        CBNZ    x0, 6f
+
+        .p2align 3
+4:
+        # ks loop
+        SUBS    x9, x9, 32  // ks -= MR * sizeof(int8_t*)
+        B.HI    1b
+
+        # Apply params - scale, shift, bias and clamp
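+        # Requantization, descriptively: SQRDMULH multiplies each int32
+        # accumulator by the fixed-point multiplier, the BIC/SSRA pair applies
+        # the sign fixup used when right_shift != 0, SRSHL performs the rounding
+        # right shift, SQXTN narrows and SQADD adds the output zero point with
+        # saturation, and SMAX/SMIN clamp the final int8 values.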
+        LD1R    {v0.4s}, [x8], 4
+        SQRDMULH     v4.4s, v16.4s, v0.4s
+        SQRDMULH     v5.4s, v17.4s, v0.4s
+        LD1R    {v1.4s}, [x8], 4
+        SQRDMULH     v6.4s, v18.4s, v0.4s
+        SQRDMULH     v7.4s, v19.4s, v0.4s
+        SQRDMULH     v8.4s, v20.4s, v0.4s
+        SQRDMULH     v9.4s, v21.4s, v0.4s
+        CMEQ    v2.4s, v1.4s, 0
+        SQRDMULH    v10.4s, v22.4s, v0.4s
+        SQRDMULH    v11.4s, v23.4s, v0.4s
+
+        BIC     v16.16b, v16.16b, v2.16b
+        BIC     v17.16b, v17.16b, v2.16b
+        BIC     v18.16b, v18.16b, v2.16b
+        BIC     v19.16b, v19.16b, v2.16b
+        BIC     v20.16b, v20.16b, v2.16b
+        BIC     v21.16b, v21.16b, v2.16b
+        BIC     v22.16b, v22.16b, v2.16b
+        BIC     v23.16b, v23.16b, v2.16b
+
+        SSRA     v4.4s, v16.4s, 31  // signed shift right accumulate
+        SSRA     v5.4s, v17.4s, 31
+        SSRA     v6.4s, v18.4s, 31
+        SSRA     v7.4s, v19.4s, 31
+        SSRA     v8.4s, v20.4s, 31
+        SSRA     v9.4s, v21.4s, 31
+        SSRA    v10.4s, v22.4s, 31
+        SSRA    v11.4s, v23.4s, 31
+
+        SQRDMULH  v16.4s, v24.4s, v0.4s
+        SQRDMULH  v17.4s, v25.4s, v0.4s
+        SQRDMULH  v18.4s, v26.4s, v0.4s
+        SQRDMULH  v19.4s, v27.4s, v0.4s
+        SQRDMULH  v20.4s, v28.4s, v0.4s
+        SQRDMULH  v21.4s, v29.4s, v0.4s
+        SQRDMULH  v22.4s, v30.4s, v0.4s
+        SQRDMULH  v23.4s, v31.4s, v0.4s
+
+        BIC     v24.16b, v24.16b, v2.16b
+        BIC     v25.16b, v25.16b, v2.16b
+        BIC     v26.16b, v26.16b, v2.16b
+        BIC     v27.16b, v27.16b, v2.16b
+        BIC     v28.16b, v28.16b, v2.16b
+        BIC     v29.16b, v29.16b, v2.16b
+        BIC     v30.16b, v30.16b, v2.16b
+        BIC     v31.16b, v31.16b, v2.16b
+
+        SSRA    v16.4s, v24.4s, 31
+        SSRA    v17.4s, v25.4s, 31
+        SSRA    v18.4s, v26.4s, 31
+        SSRA    v19.4s, v27.4s, 31
+        SSRA    v20.4s, v28.4s, 31
+        SSRA    v21.4s, v29.4s, 31
+        SSRA    v22.4s, v30.4s, 31
+        SSRA    v23.4s, v31.4s, 31
+
+        SRSHL    v4.4s,  v4.4s, v1.4s  // signed rounding shift left
+        SRSHL    v5.4s,  v5.4s, v1.4s
+        SRSHL    v6.4s,  v6.4s, v1.4s
+        SRSHL    v7.4s,  v7.4s, v1.4s
+        SRSHL    v8.4s,  v8.4s, v1.4s
+        SRSHL    v9.4s,  v9.4s, v1.4s
+        SRSHL   v10.4s, v10.4s, v1.4s
+        SRSHL   v11.4s, v11.4s, v1.4s
+
+        SRSHL   v16.4s, v16.4s, v1.4s
+        SRSHL   v17.4s, v17.4s, v1.4s
+        SRSHL   v18.4s, v18.4s, v1.4s
+        SRSHL   v19.4s, v19.4s, v1.4s
+        SRSHL   v20.4s, v20.4s, v1.4s
+        SRSHL   v21.4s, v21.4s, v1.4s
+        SRSHL   v22.4s, v22.4s, v1.4s
+        SRSHL   v23.4s, v23.4s, v1.4s
+
+        SQXTN    v4.4h,  v4.4s
+        SQXTN    v5.4h,  v5.4s
+        SQXTN    v6.4h,  v6.4s
+        SQXTN    v7.4h,  v7.4s
+        SQXTN   v16.4h, v16.4s
+        SQXTN   v17.4h, v17.4s
+        SQXTN   v18.4h, v18.4s
+        SQXTN   v19.4h, v19.4s
+        LD1R    {v2.8h}, [x8], 2   // add bias
+
+        SQXTN2   v4.8h,  v8.4s
+        SQXTN2   v5.8h,  v9.4s
+        SQXTN2   v6.8h, v10.4s
+        SQXTN2   v7.8h, v11.4s
+        SQXTN2  v16.8h, v20.4s
+        SQXTN2  v17.8h, v21.4s
+        SQXTN2  v18.8h, v22.4s
+        SQXTN2  v19.8h, v23.4s
+
+        SQADD    v4.8h,  v4.8h, v2.8h
+        SQADD    v5.8h,  v5.8h, v2.8h
+        SQADD    v6.8h,  v6.8h, v2.8h
+        SQADD    v7.8h,  v7.8h, v2.8h
+        SQADD   v16.8h, v16.8h, v2.8h
+        SQADD   v17.8h, v17.8h, v2.8h
+        SQADD   v18.8h, v18.8h, v2.8h
+        SQADD   v19.8h, v19.8h, v2.8h
+        LD1R    {v0.16b}, [x8], 1  // clamp min value
+
+        SQXTN    v4.8b,  v4.8h
+        SQXTN    v5.8b,  v5.8h
+        SQXTN    v6.8b,  v6.8h
+        SQXTN    v7.8b,  v7.8h
+        LD1R    {v1.16b}, [x8]     // clamp max value
+        SQXTN2   v4.16b, v16.8h
+        SQXTN2   v5.16b, v17.8h
+        SQXTN2   v6.16b, v18.8h
+        SQXTN2   v7.16b, v19.8h
+        SUB      x8, x8, 11       // rewind params pointer
+
+        SMAX     v4.16b,  v4.16b, v0.16b
+        SMAX     v5.16b,  v5.16b, v0.16b
+        SMAX     v6.16b,  v6.16b, v0.16b
+        SMAX     v7.16b,  v7.16b, v0.16b
+        SUBS    x1, x1, 16
+        SMIN     v4.16b,  v4.16b, v1.16b
+        SMIN     v5.16b,  v5.16b, v1.16b
+        SMIN     v6.16b,  v6.16b, v1.16b
+        SMIN     v7.16b,  v7.16b, v1.16b
+        B.LO    7f
+
+        # Store full 4 x 16
+        ST1     {v7.16b},  [x7], x10
+        ST1     {v6.16b}, [x17], x10
+        ST1     {v5.16b}, [x16], x10
+        ST1     {v4.16b},  [x6], x10
+
+        SUB     x4, x4, x3  // a -= ks
+
+        # nc loop
+        B.HI    0b
+
+        # Restore d8-d15 from stack
+        LDP     d10, d11, [sp, 32]
+        LDP      d8,  d9, [sp, 16]
+
+        # Restore x20-x21 from stack
+        LDP     x20, x21, [sp], 48
+        RET
+
+        # Remainder - 4 to 12 bytes of A
+        # Although the kernel is C4, it's safe to read 16 bytes.
+        .p2align 3
+5:
+        AND     x0, x2, 15        // kc remainder 4 to 12
+6:
+        LDR     q0, [x20]
+        LDP     q8,  q9,  [x5], 32
+        LDR     q1, [x15]
+        LDR     q2, [x13]
+        LDR     q3, [x21]
+        LDP     q10, q11, [x5], 32
+        SDOT    v16.4s,  v8.16b, v0.4b[0]
+        SDOT    v17.4s,  v8.16b, v1.4b[0]
+        SDOT    v18.4s,  v8.16b, v2.4b[0]
+        SDOT    v19.4s,  v8.16b, v3.4b[0]
+        SDOT    v20.4s,  v9.16b, v0.4b[0]
+        SDOT    v21.4s,  v9.16b, v1.4b[0]
+        SDOT    v22.4s,  v9.16b, v2.4b[0]
+        SDOT    v23.4s,  v9.16b, v3.4b[0]
+        SDOT    v24.4s, v10.16b, v0.4b[0]
+        SDOT    v25.4s, v10.16b, v1.4b[0]
+        SDOT    v26.4s, v10.16b, v2.4b[0]
+        SDOT    v27.4s, v10.16b, v3.4b[0]
+        SDOT    v28.4s, v11.16b, v0.4b[0]
+        SDOT    v29.4s, v11.16b, v1.4b[0]
+        SDOT    v30.4s, v11.16b, v2.4b[0]
+        SDOT    v31.4s, v11.16b, v3.4b[0]
+        CMP     x0, 4
+        B.LS    4b
+        LDP      q8,  q9,  [x5], 32
+        LDP     q10, q11,  [x5], 32
+        SDOT    v16.4s,  v8.16b, v0.4b[1]
+        SDOT    v17.4s,  v8.16b, v1.4b[1]
+        SDOT    v18.4s,  v8.16b, v2.4b[1]
+        SDOT    v19.4s,  v8.16b, v3.4b[1]
+        SDOT    v20.4s,  v9.16b, v0.4b[1]
+        SDOT    v21.4s,  v9.16b, v1.4b[1]
+        SDOT    v22.4s,  v9.16b, v2.4b[1]
+        SDOT    v23.4s,  v9.16b, v3.4b[1]
+        SDOT    v24.4s, v10.16b, v0.4b[1]
+        SDOT    v25.4s, v10.16b, v1.4b[1]
+        SDOT    v26.4s, v10.16b, v2.4b[1]
+        SDOT    v27.4s, v10.16b, v3.4b[1]
+        SDOT    v28.4s, v11.16b, v0.4b[1]
+        SDOT    v29.4s, v11.16b, v1.4b[1]
+        SDOT    v30.4s, v11.16b, v2.4b[1]
+        SDOT    v31.4s, v11.16b, v3.4b[1]
+        CMP     x0, 8
+        B.LS    4b
+        LDP       q8,  q9,  [x5], 32
+        LDP      q10, q11,  [x5], 32
+        SDOT    v16.4s,  v8.16b, v0.4b[2]
+        SDOT    v17.4s,  v8.16b, v1.4b[2]
+        SDOT    v18.4s,  v8.16b, v2.4b[2]
+        SDOT    v19.4s,  v8.16b, v3.4b[2]
+        SDOT    v20.4s,  v9.16b, v0.4b[2]
+        SDOT    v21.4s,  v9.16b, v1.4b[2]
+        SDOT    v22.4s,  v9.16b, v2.4b[2]
+        SDOT    v23.4s,  v9.16b, v3.4b[2]
+        SDOT    v24.4s, v10.16b, v0.4b[2]
+        SDOT    v25.4s, v10.16b, v1.4b[2]
+        SDOT    v26.4s, v10.16b, v2.4b[2]
+        SDOT    v27.4s, v10.16b, v3.4b[2]
+        SDOT    v28.4s, v11.16b, v0.4b[2]
+        SDOT    v29.4s, v11.16b, v1.4b[2]
+        SDOT    v30.4s, v11.16b, v2.4b[2]
+        SDOT    v31.4s, v11.16b, v3.4b[2]
+        B       4b
+
+        # Store odd width
+        .p2align 3
+7:
+        TBZ     x1, 3, 8f
+        STR     d7, [x7], 8
+        DUP     d7, v7.d[1]
+        STR     d6, [x17], 8
+        DUP     d6, v6.d[1]
+        STR     d5, [x16], 8
+        DUP     d5, v5.d[1]
+        STR     d4, [x6], 8
+        DUP     d4, v4.d[1]
+8:
+        TBZ     x1, 2, 9f
+        STR     s7, [x7], 4
+        DUP     s7, v7.s[1]
+        STR     s6, [x17], 4
+        DUP     s6, v6.s[1]
+        STR     s5, [x16], 4
+        DUP     s5, v5.s[1]
+        STR     s4, [x6], 4
+        DUP     s4, v4.s[1]
+9:
+        TBZ     x1, 1, 10f
+        ST1     {v7.h}[0], [x7], 2
+        DUP      h7, v7.h[1]
+        ST1     {v6.h}[0], [x17], 2
+        DUP      h6, v6.h[1]
+        ST1     {v5.h}[0], [x16], 2
+        DUP      h5, v5.h[1]
+        ST1     {v4.h}[0], [x6], 2
+        DUP      h4, v4.h[1]
+10:
+        TBZ     x1, 0, 11f
+        ST1     {v7.b}[0], [x7]
+        ST1     {v6.b}[0], [x17]
+        ST1     {v5.b}[0], [x16]
+        ST1     {v4.b}[0], [x6]
+11:
+        # Restore d8-d15 from stack
+        LDP     d10, d11, [sp, 32]
+        LDP      d8,  d9, [sp, 16]
+
+        # Restore x20-x21 from stack
+        LDP     x20, x21, [sp], 48
+        RET
+
+END_FUNCTION xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/qs8-igemm/4x16c4-aarch64-neondot-ld64.S b/src/qs8-igemm/4x16c4-aarch64-neondot-ld64.S
new file mode 100644
index 0000000..24da63f
--- /dev/null
+++ b/src/qs8-igemm/4x16c4-aarch64-neondot-ld64.S
@@ -0,0 +1,379 @@
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <xnnpack/assembly.h>
+
+# void xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64(
+#     size_t mr,                 x0
+#     size_t nc,                 x1
+#     size_t kc,                 x2 / x0
+#     size_t ks,                 x3 / x9
+#     const int8_t**restrict a,  x4
+#     const int8_t* restrict w,  x5
+#     int8_t* restrict c,        x6
+#     size_t cm_stride,          x7
+#     size_t cn_stride,                  [sp] -> x10
+#     size_t a_offset,                   [sp + 8] -> x11
+#     const float* zero,                 [sp + 16] -> x12
+#     const union xnn_qs8_gemm_params params)  [sp + 24] -> x8
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0  x20  v0
+# A1  x15  v1
+# A2  x13  v2
+# A3  x21  v3
+# B    x5  v4  v5  v6  v7
+# C0   x6 v16 v20 v24 v28
+# C1  x16 v17 v21 v25 v29
+# C2  x17 v18 v22 v26 v30
+# C3   x7 v19 v23 v27 v31
+# unused v8 v9 v10 v11 v12 v13 v14 v15
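+#
+# Note: this LD64 variant loads A 8 bytes per row per iteration and B with
+# 128-bit LDR/LDP, without the 64-bit load / INS interleaving used by the
+# cortex-a55 variant of this kernel.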
+
+BEGIN_FUNCTION xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64
+
+        # Clamp C pointers
+        CMP      x0, 2                // if mr < 2
+        LDP     x10, x11, [sp]        // Load cn_stride, a_offset
+        ADD     x16, x6, x7           // c1 = c0 + cm_stride
+        CSEL    x16, x6,  x16, LO     //   c1 = c0
+        ADD      x2, x2, 3            // kc = (kc + 3) & ~3
+
+        ADD     x17, x16, x7          // c2 = c1 + cm_stride
+        LDP     x12, x8, [sp, 16]     // Load zero, params pointer
+                                      // if mr <= 2
+        CSEL    x17, x16, x17, LS     //   c2 = c1
+        BIC      x2, x2, 3
+
+        CMP      x0, 4                // if mr < 4
+        STP     x20, x21, [sp, -16]!  // Save x20-x21 on stack
+        ADD      x7,  x17, x7         // c3 = c2 + cm_stride
+        CSEL     x7,  x17, x7, LO     //   c3 = c2
+
+        .p2align 3
+0:
+        # Load initial bias from w into accumulators
+        LDP     q16, q20, [x5], 32
+        MOV     v17.16b, v16.16b
+        MOV     v18.16b, v16.16b
+        LDP     q24, q28, [x5], 32
+        MOV     v19.16b, v16.16b
+        MOV     v21.16b, v20.16b
+        MOV     v22.16b, v20.16b
+        MOV     v23.16b, v20.16b
+        MOV     v25.16b, v24.16b
+        MOV     v26.16b, v24.16b
+        MOV     v27.16b, v24.16b
+        MOV     v29.16b, v28.16b
+        MOV     v30.16b, v28.16b
+        MOV     v31.16b, v28.16b
+        MOV     x9, x3  // p = ks
+
+        .p2align 3
+1:
+        # Load next 4 A pointers
+        LDP     x20, x15, [x4], 16
+        LDP     x13, x21, [x4], 16
+
+        CMP     x20, x12           // if a0 == zero
+        ADD     x20, x20, x11      // a0 += a_offset
+        CSEL    x20, x12, x20, EQ  //   a0 = zero, else a0 + a_offset
+        CMP     x15, x12           // if a1 == zero
+        ADD     x15, x15, x11      // a1 += a_offset
+        CSEL    x15, x12, x15, EQ  //   a1 = zero, else a1 + a_offset
+        CMP     x13, x12           // if a2 == zero
+        ADD     x13, x13, x11      // a2 += a_offset
+        CSEL    x13, x12, x13, EQ  //   a2 = zero, else a2 + a_offset
+        CMP     x21, x12           // if a3 == zero
+        ADD     x21, x21, x11      // a3 += a_offset
+        CSEL    x21, x12, x21, EQ  //   a3 = zero, else a3 + a_offset
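+        # A pointers that equal the shared zero buffer are left pointing at it
+        # (a_offset is not applied), so padded rows read from the zero buffer
+        # instead of real input data.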
+
+        # Is there at least 8 bytes for main loop?
+        SUBS    x0, x2, 8          // k = kc - 8
+        B.LO    4f
+
+        # Main loop - 8 bytes of A
+        .p2align 3
+2:
+        LDR     d0, [x20], 8
+        LDR     q4,  [x5], 16
+        LDR     d1, [x15], 8
+        LDR     d2, [x13], 8
+        LDR     d3, [x21], 8
+        LDR     q5,  [x5], 16
+        SDOT    v16.4s, v4.16b,  v0.4b[0]
+        SDOT    v17.4s, v4.16b,  v1.4b[0]
+        LDP     q6, q7, [x5], 32
+        SDOT    v18.4s, v4.16b,  v2.4b[0]
+        SDOT    v19.4s, v4.16b,  v3.4b[0]
+        SDOT    v20.4s, v5.16b,  v0.4b[0]
+        SDOT    v21.4s, v5.16b,  v1.4b[0]
+        SDOT    v22.4s, v5.16b,  v2.4b[0]
+        SDOT    v23.4s, v5.16b,  v3.4b[0]
+        SDOT    v24.4s, v6.16b, v0.4b[0]
+        SDOT    v25.4s, v6.16b, v1.4b[0]
+        LDP     q4, q5, [x5], 32
+        SDOT    v26.4s, v6.16b, v2.4b[0]
+        SDOT    v27.4s, v6.16b, v3.4b[0]
+        SDOT    v28.4s, v7.16b, v0.4b[0]
+        SDOT    v29.4s, v7.16b, v1.4b[0]
+        SDOT    v30.4s, v7.16b, v2.4b[0]
+        SDOT    v31.4s, v7.16b, v3.4b[0]
+        SDOT    v16.4s, v4.16b,  v0.4b[1]
+        SDOT    v17.4s, v4.16b,  v1.4b[1]
+        LDP     q6, q7, [x5], 32
+        SDOT    v18.4s, v4.16b,  v2.4b[1]
+        SDOT    v19.4s, v4.16b,  v3.4b[1]
+        SDOT    v20.4s, v5.16b,  v0.4b[1]
+        SDOT    v21.4s, v5.16b,  v1.4b[1]
+        SDOT    v22.4s, v5.16b,  v2.4b[1]
+        SDOT    v23.4s, v5.16b,  v3.4b[1]
+        SDOT    v24.4s, v6.16b,  v0.4b[1]
+        SDOT    v25.4s, v6.16b,  v1.4b[1]
+        SDOT    v26.4s, v6.16b,  v2.4b[1]
+        SDOT    v27.4s, v6.16b,  v3.4b[1]
+        SDOT    v28.4s, v7.16b,  v0.4b[1]
+        SDOT    v29.4s, v7.16b,  v1.4b[1]
+        SDOT    v30.4s, v7.16b,  v2.4b[1]
+        SUBS    x0, x0, 8
+        SDOT    v31.4s, v7.16b,  v3.4b[1]
+        B.HS    2b
+
+        # Is there a remainder? - 4 bytes of A
+        TBNZ    x0, 2, 4f
+
+        # ks loop
+        SUBS    x9, x9, 32  // ks -= MR * sizeof(int8_t*)
+        B.HI    1b
+
+3:
+        # Apply params - scale, shift, bias and clamp
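+        # (Same requantization sequence as the cortex-a55 variant above:
+        #  SQRDMULH by the multiplier, BIC/SSRA sign fixup, SRSHL rounding
+        #  shift, zero-point add with saturation, then clamp.)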
+        LD2R    {v0.4s, v1.4s}, [x8], 8
+        CMEQ    v2.4s, v1.4s, 0
+
+        BIC     v4.16b, v16.16b, v2.16b
+        BIC     v5.16b, v17.16b, v2.16b
+        BIC     v6.16b, v18.16b, v2.16b
+        BIC     v7.16b, v19.16b, v2.16b
+
+        SQRDMULH  v16.4s, v16.4s, v0.4s
+        SQRDMULH  v17.4s, v17.4s, v0.4s
+        SQRDMULH  v18.4s, v18.4s, v0.4s
+        SQRDMULH  v19.4s, v19.4s, v0.4s
+
+        SSRA    v16.4s, v4.4s, 31  // signed shift right accumulate
+        SSRA    v17.4s, v5.4s, 31
+        SSRA    v18.4s, v6.4s, 31
+        SSRA    v19.4s, v7.4s, 31
+
+        BIC     v4.16b, v20.16b, v2.16b
+        BIC     v5.16b, v21.16b, v2.16b
+        BIC     v6.16b, v22.16b, v2.16b
+        BIC     v7.16b, v23.16b, v2.16b
+
+        SQRDMULH  v20.4s, v20.4s, v0.4s
+        SQRDMULH  v21.4s, v21.4s, v0.4s
+        SQRDMULH  v22.4s, v22.4s, v0.4s
+        SQRDMULH  v23.4s, v23.4s, v0.4s
+
+        SSRA    v20.4s, v4.4s, 31
+        SSRA    v21.4s, v5.4s, 31
+        SSRA    v22.4s, v6.4s, 31
+        SSRA    v23.4s, v7.4s, 31
+
+        BIC     v4.16b, v24.16b, v2.16b
+        BIC     v5.16b, v25.16b, v2.16b
+        BIC     v6.16b, v26.16b, v2.16b
+        BIC     v7.16b, v27.16b, v2.16b
+
+        SQRDMULH  v24.4s, v24.4s, v0.4s
+        SQRDMULH  v25.4s, v25.4s, v0.4s
+        SQRDMULH  v26.4s, v26.4s, v0.4s
+        SQRDMULH  v27.4s, v27.4s, v0.4s
+
+        SSRA    v24.4s, v4.4s, 31
+        SSRA    v25.4s, v5.4s, 31
+        SSRA    v26.4s, v6.4s, 31
+        SSRA    v27.4s, v7.4s, 31
+
+        BIC     v4.16b, v28.16b, v2.16b
+        BIC     v5.16b, v29.16b, v2.16b
+        BIC     v6.16b, v30.16b, v2.16b
+        BIC     v7.16b, v31.16b, v2.16b
+
+        SQRDMULH  v28.4s, v28.4s, v0.4s
+        SQRDMULH  v29.4s, v29.4s, v0.4s
+        SQRDMULH  v30.4s, v30.4s, v0.4s
+        SQRDMULH  v31.4s, v31.4s, v0.4s
+
+        SSRA    v28.4s, v4.4s, 31
+        SSRA    v29.4s, v5.4s, 31
+        SSRA    v30.4s, v6.4s, 31
+        SSRA    v31.4s, v7.4s, 31
+
+        SRSHL   v16.4s, v16.4s, v1.4s  // signed rounding shift left
+        SRSHL   v17.4s, v17.4s, v1.4s
+        SRSHL   v18.4s, v18.4s, v1.4s
+        SRSHL   v19.4s, v19.4s, v1.4s
+        SRSHL   v20.4s, v20.4s, v1.4s
+        SRSHL   v21.4s, v21.4s, v1.4s
+        SRSHL   v22.4s, v22.4s, v1.4s
+        SRSHL   v23.4s, v23.4s, v1.4s
+        SRSHL   v24.4s, v24.4s, v1.4s
+        SRSHL   v25.4s, v25.4s, v1.4s
+        SRSHL   v26.4s, v26.4s, v1.4s
+        SRSHL   v27.4s, v27.4s, v1.4s
+        SRSHL   v28.4s, v28.4s, v1.4s
+        SRSHL   v29.4s, v29.4s, v1.4s
+        SRSHL   v30.4s, v30.4s, v1.4s
+        SRSHL   v31.4s, v31.4s, v1.4s
+
+        SQXTN   v16.4h, v16.4s
+        SQXTN   v17.4h, v17.4s
+        SQXTN   v18.4h, v18.4s
+        SQXTN   v19.4h, v19.4s
+        SQXTN   v24.4h, v24.4s
+        SQXTN   v25.4h, v25.4s
+        SQXTN   v26.4h, v26.4s
+        SQXTN   v27.4h, v27.4s
+        LD1R    {v2.8h}, [x8], 2   // add bias
+
+        SQXTN2  v16.8h, v20.4s
+        SQXTN2  v17.8h, v21.4s
+        SQXTN2  v18.8h, v22.4s
+        SQXTN2  v19.8h, v23.4s
+        SQXTN2  v24.8h, v28.4s
+        SQXTN2  v25.8h, v29.4s
+        SQXTN2  v26.8h, v30.4s
+        SQXTN2  v27.8h, v31.4s
+
+        SQADD   v16.8h, v16.8h, v2.8h
+        SQADD   v17.8h, v17.8h, v2.8h
+        SQADD   v18.8h, v18.8h, v2.8h
+        SQADD   v19.8h, v19.8h, v2.8h
+        SQADD   v24.8h, v24.8h, v2.8h
+        SQADD   v25.8h, v25.8h, v2.8h
+        SQADD   v26.8h, v26.8h, v2.8h
+        SQADD   v27.8h, v27.8h, v2.8h
+        LD1R    {v0.16b}, [x8], 1  // clamp min value
+
+        SQXTN    v4.8b, v16.8h
+        SQXTN    v5.8b, v17.8h
+        SQXTN    v6.8b, v18.8h
+        SQXTN    v7.8b, v19.8h
+        LD1R    {v1.16b}, [x8]     // clamp max value
+        SQXTN2   v4.16b, v24.8h
+        SQXTN2   v5.16b, v25.8h
+        SQXTN2   v6.16b, v26.8h
+        SQXTN2   v7.16b, v27.8h
+        SUB      x8, x8, 11       // rewind params pointer
+
+        SMAX    v4.16b, v4.16b, v0.16b
+        SMAX    v5.16b, v5.16b, v0.16b
+        SMAX    v6.16b, v6.16b, v0.16b
+        SMAX    v7.16b, v7.16b, v0.16b
+        SUBS    x1, x1, 16
+        SMIN    v4.16b, v4.16b, v1.16b
+        SMIN    v5.16b, v5.16b, v1.16b
+        SMIN    v6.16b, v6.16b, v1.16b
+        SMIN    v7.16b, v7.16b, v1.16b
+        B.LO    5f
+
+        # Store full 4 x 16
+        ST1     {v7.16b},  [x7], x10
+        ST1     {v6.16b}, [x17], x10
+        ST1     {v5.16b}, [x16], x10
+        ST1     {v4.16b},  [x6], x10
+
+        SUB     x4, x4, x3  // a -= ks
+
+        # nc loop
+        B.HI    0b
+
+        # Restore x20-x21 from stack
+        LDP     x20, x21, [sp], 16
+        RET
+
+        # Remainder - 4 bytes of A
+        .p2align 3
+4:
+        LDR     s0, [x20], 4
+        LDR     q4,  [x5], 16
+        LDR     s1, [x15], 4
+        LDR     s2, [x13], 4
+        LDR     s3, [x21], 4
+        LDR     q5,  [x5], 16
+        SDOT    v16.4s, v4.16b,  v0.4b[0]
+        SDOT    v17.4s, v4.16b,  v1.4b[0]
+        LDP     q6, q7, [x5], 32
+        SDOT    v18.4s, v4.16b,  v2.4b[0]
+        SDOT    v19.4s, v4.16b,  v3.4b[0]
+        SDOT    v20.4s, v5.16b,  v0.4b[0]
+        SDOT    v21.4s, v5.16b,  v1.4b[0]
+        SDOT    v22.4s, v5.16b,  v2.4b[0]
+        SDOT    v23.4s, v5.16b,  v3.4b[0]
+        SDOT    v24.4s, v6.16b, v0.4b[0]
+        SDOT    v25.4s, v6.16b, v1.4b[0]
+        SDOT    v26.4s, v6.16b, v2.4b[0]
+        SDOT    v27.4s, v6.16b, v3.4b[0]
+        SDOT    v28.4s, v7.16b, v0.4b[0]
+        SDOT    v29.4s, v7.16b, v1.4b[0]
+        SDOT    v30.4s, v7.16b, v2.4b[0]
+        SDOT    v31.4s, v7.16b, v3.4b[0]
+
+        # ks loop
+        SUBS    x9, x9, 32  // ks -= MR * sizeof(int8_t*)
+        B.HI    1b
+        B       3b
+
+        # Store odd width
+        .p2align 3
+5:
+        TBZ     x1, 3, 6f
+        STR     d7, [x7], 8
+        DUP     d7, v7.d[1]
+        STR     d6, [x17], 8
+        DUP     d6, v6.d[1]
+        STR     d5, [x16], 8
+        DUP     d5, v5.d[1]
+        STR     d4, [x6], 8
+        DUP     d4, v4.d[1]
+6:
+        TBZ     x1, 2, 7f
+        STR     s7, [x7], 4
+        DUP     s7, v7.s[1]
+        STR     s6, [x17], 4
+        DUP     s6, v6.s[1]
+        STR     s5, [x16], 4
+        DUP     s5, v5.s[1]
+        STR     s4, [x6], 4
+        DUP     s4, v4.s[1]
+7:
+        TBZ     x1, 1, 8f
+        ST1     {v7.h}[0], [x7], 2
+        DUP      h7, v7.h[1]
+        ST1     {v6.h}[0], [x17], 2
+        DUP      h6, v6.h[1]
+        ST1     {v5.h}[0], [x16], 2
+        DUP      h5, v5.h[1]
+        ST1     {v4.h}[0], [x6], 2
+        DUP      h4, v4.h[1]
+8:
+        TBZ     x1, 0, 9f
+        ST1     {v7.b}[0], [x7]
+        ST1     {v6.b}[0], [x17]
+        ST1     {v5.b}[0], [x16]
+        ST1     {v4.b}[0], [x6]
+9:
+        # Restore x20-x21 from stack
+        LDP     x20, x21, [sp], 16
+        RET
+
+END_FUNCTION xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/qs8-igemm/MRx16c8-avx512skx.c.in b/src/qs8-igemm/MRx16c8-avx512skx.c.in
index 0244207..af17147 100644
--- a/src/qs8-igemm/MRx16c8-avx512skx.c.in
+++ b/src/qs8-igemm/MRx16c8-avx512skx.c.in
@@ -12,6 +12,7 @@
 
 #include <xnnpack/igemm.h>
 #include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/math.h>
 
 
 $GEMM_SUFFIX = "_xw" if VARIANT == "EXTENDED" else ""
@@ -38,6 +39,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
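+  // Round kc up to the kernel's C8 block size so the K loop below works in whole blocks.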
   int8_t* c0 = c;
   $for M in range(1, MR):
     int8_t* c${M} = (int8_t*) ((uintptr_t) c${M-1} + cm_stride);
diff --git a/src/qs8-igemm/MRx4c2-sse.c.in b/src/qs8-igemm/MRx4c2-sse.c.in
index b917f66..d111fa1 100644
--- a/src/qs8-igemm/MRx4c2-sse.c.in
+++ b/src/qs8-igemm/MRx4c2-sse.c.in
@@ -18,6 +18,7 @@
   #include <${SSE_HEADER}>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 $ISA = {2: "sse2", 3: "ssse3", 4: "sse41", 5: "xop"}[SSE]
@@ -46,6 +47,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   int8_t* c0 = c;
   $for M in range(1, MR):
     int8_t* c${M} = (int8_t*) ((uintptr_t) c${M-1} + cm_stride);
@@ -180,20 +182,6 @@
               $else:
                 vacc${M}x0123 = _mm_add_epi32(vacc${M}x0123,
                   _mm_madd_epi16(_mm_shuffle_epi32(vxa${M}, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-            if (k > 6 * sizeof(int8_t)) {
-              const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-              w = (const void*) ((uintptr_t) w + 8);
-              const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
-
-              $for M in range(MR):
-                $if SSE == 5:
-                  vacc${M}x0123 = _mm_maddd_epi16(
-                    _mm_shuffle_epi32(vxa${M}, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc${M}x0123);
-                $else:
-                  vacc${M}x0123 = _mm_add_epi32(vacc${M}x0123,
-                    _mm_madd_epi16(_mm_shuffle_epi32(vxa${M}, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            }
           }
         }
       }
diff --git a/src/qs8-igemm/MRx4c8-sse.c.in b/src/qs8-igemm/MRx4c8-sse.c.in
index 6a9b5df..030d144 100644
--- a/src/qs8-igemm/MRx4c8-sse.c.in
+++ b/src/qs8-igemm/MRx4c8-sse.c.in
@@ -18,6 +18,7 @@
   #include <${SSE_HEADER}>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 $ISA = {2: "sse2", 3: "ssse3", 4: "sse41", 5: "xop"}[SSE]
@@ -46,6 +47,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   $for M in range(1, MR):
     int8_t* c${M} = (int8_t*) ((uintptr_t) c${M-1} + cm_stride);
diff --git a/src/qs8-igemm/MRx4c8-wasmsimd.c.in b/src/qs8-igemm/MRx4c8-wasmsimd.c.in
index 51dc5cb..77282fc 100644
--- a/src/qs8-igemm/MRx4c8-wasmsimd.c.in
+++ b/src/qs8-igemm/MRx4c8-wasmsimd.c.in
@@ -10,6 +10,7 @@
 #include <wasm_simd128.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 $LOAD_SUFFIX = {"LD128": "_ld128", "LD64": "_ld64", "EXTENDED": ""}[VARIANT]
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   $for M in range(1, MR):
     int8_t* c${M} = (int8_t*) ((uintptr_t) c${M-1} + cm_stride);
diff --git a/src/qs8-igemm/MRx8c8-avx2.c.in b/src/qs8-igemm/MRx8c8-avx2.c.in
index f30a5e8..ed18162 100644
--- a/src/qs8-igemm/MRx8c8-avx2.c.in
+++ b/src/qs8-igemm/MRx8c8-avx2.c.in
@@ -10,6 +10,7 @@
 
 #include <xnnpack/igemm.h>
 #include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_${MR}x8c8__avx2(
@@ -37,6 +38,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   $for M in range(1, MR):
     int8_t* c${M} = (int8_t*) ((uintptr_t) c${M-1} + cm_stride);
diff --git a/src/qs8-igemm/MRxNRc4-neondot.c.in b/src/qs8-igemm/MRxNRc4-neondot.c.in
index 5aa2cfd..9cf885c 100644
--- a/src/qs8-igemm/MRxNRc4-neondot.c.in
+++ b/src/qs8-igemm/MRxNRc4-neondot.c.in
@@ -10,8 +10,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_${MR}x${NR}c4__neondot(
@@ -39,6 +39,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 4);
   int8_t* c0 = c;
   $for M in range(1, MR):
     int8_t* c${M} = (int8_t*) ((uintptr_t) c${M-1} + cm_stride);
@@ -82,7 +83,7 @@
         // Load a 8x${NR} block of weights.
         $for K in range(0, 8, 4):
           $for N in range(0, NR, 4):
-            const int8x16_t vb${ABC[K:K+4]}x${ABC[N:N+4]} = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
+            const int8x16_t vb${ABC[K:K+4]}x${ABC[N:N+4]} = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
 
         // Multiply-accumulate: ${MR}x8 * 8x${NR} --> ${MR}x${NR}.
         $for K in range(0, 8, 4):
@@ -92,7 +93,7 @@
 
         k -= 8 * sizeof(int8_t);
       }
-      // Handle up to 7 final positions of `k`
+      // Handle up to 4 final positions of `k`
       if XNN_UNLIKELY(k != 0) {
         // Load a ${MR}x4 block of activations.
         $for M in range(MR):
@@ -100,23 +101,12 @@
 
         // Load a 4x${NR} block of weights.
         $for N in range(0, NR, 4):
-          const int8x16_t vb0123x${ABC[N:N+4]} = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
+          const int8x16_t vb0123x${ABC[N:N+4]} = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
 
         // Multiply-accumulate: ${MR}x4 * 4x${NR} --> ${MR}x${NR}.
         $for M in range(MR):
           $for N in range(0, NR, 4):
               vacc${M}x${ABC[N:N+4]} = vdotq_lane_s32(vacc${M}x${ABC[N:N+4]}, vb0123x${ABC[N:N+4]}, va${M}x01234567, 0);
-
-        if (k > 4) {
-          // Load a 4x${NR} block of weights.
-          $for N in range(0, NR, 4):
-            const int8x16_t vb4567x${ABC[N:N+4]} = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-
-          // Multiply-accumulate: ${MR}x4 * 4x${NR} --> ${MR}x${NR}.
-          $for M in range(MR):
-            $for N in range(0, NR, 4):
-                vacc${M}x${ABC[N:N+4]} = vdotq_lane_s32(vacc${M}x${ABC[N:N+4]}, vb4567x${ABC[N:N+4]}, va${M}x01234567, 1);
-        }
       }
       p -= ${MR} * sizeof(void*);
     } while (p != 0);
diff --git a/src/qs8-igemm/c16-neon-mlal-padal.c.in b/src/qs8-igemm/c16-neon-mlal-padal.c.in
new file mode 100644
index 0000000..1d95ae3
--- /dev/null
+++ b/src/qs8-igemm/c16-neon-mlal-padal.c.in
@@ -0,0 +1,266 @@
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+$assert NR % 8 == 0
+$assert 8 <= NR <= 16
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_${MR}x${NR}c16__neon_mlal_padal(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= ${MR});
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (${MR} * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 16);
+  int8_t* c0 = c;
+  $for M in range(1, MR):
+    int8_t* c${M} = (int8_t*) ((uintptr_t) c${M-1} + cm_stride);
+    $if M % 2 == 0:
+      if XNN_UNPREDICTABLE(mr <= ${M}) {
+        c${M} = c${M-1};
+      }
+    $elif M + 1 == MR:
+      if XNN_UNPREDICTABLE(mr != ${M+1}) {
+        c${M} = c${M-1};
+      }
+    $else:
+      if XNN_UNPREDICTABLE(mr < ${M+1}) {
+        c${M} = c${M-1};
+      }
+
+  do {
+    $for N in range(NR):
+      int32x4_t vacc0x${N} = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    $for M in range(1, MR):
+      $for N in range(NR):
+        int32x4_t vacc${M}x${N} = vacc0x${N};
+
+    size_t p = ks;
+    do {
+      $for M in range(MR):
+        const int8_t* restrict a${M} = a[${M}];
+        if XNN_UNPREDICTABLE(a${M} != zero) {
+          a${M} = (const int8_t*) ((uintptr_t) a${M} + a_offset);
+        }
+      a += ${MR};
+
+      // KC loop of 16; any kc remainder of up to 15 bytes is covered by the round-up above
+      size_t k = 0;
+      while (k < kc) {
+        $for M in range(MR):
+          const int8x16_t va${M} = vld1q_s8(a${M}); a${M} += 16;
+
+        $for N in range(NR):
+          const int8x16_t vb${N} = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+
+        $for N in range(NR):
+          $for M in range(MR):
+            int16x8_t vprod${M}x${N} = vmull_s8(vget_low_s8(vb${N}), vget_low_s8(va${M}));
+          $for M in range(MR):
+            vprod${M}x${N} = vmlal_s8(vprod${M}x${N}, vget_high_s8(vb${N}), vget_high_s8(va${M}));
+          $for M in range(MR):
+            vacc${M}x${N} = vpadalq_s16(vacc${M}x${N}, vprod${M}x${N});
+
+        k += 16 * sizeof(int8_t);
+      }
+
+      p -= ${MR} * sizeof(void*);
+    } while (p != 0);
+
+#if XNN_ARCH_ARM64
+    $for M in range(MR):
+      $for N in range(0, NR, 4):
+        const int32x4_t vsum${M}x${ABC[N:N+2]} = vpaddq_s32(vacc${M}x${N}, vacc${M}x${N+1});
+        const int32x4_t vsum${M}x${ABC[N+2:N+4]} = vpaddq_s32(vacc${M}x${N+2}, vacc${M}x${N+3});
+    $for M in range(MR):
+      $for N in range(0, NR, 4):
+        int32x4_t vacc${M}x${ABC[N:N+4]} = vpaddq_s32(vsum${M}x${ABC[N:N+2]}, vsum${M}x${ABC[N+2:N+4]});
+#else
+    $for M in range(MR):
+      $for N in range(0, NR, 4):
+        const int32x2_t vpsum${M}x${ABC[N]} = vadd_s32(vget_low_s32(vacc${M}x${N}), vget_high_s32(vacc${M}x${N}));
+        const int32x2_t vpsum${M}x${ABC[N+1]} = vadd_s32(vget_low_s32(vacc${M}x${N+1}), vget_high_s32(vacc${M}x${N+1}));
+        const int32x2_t vpsum${M}x${ABC[N+2]} = vadd_s32(vget_low_s32(vacc${M}x${N+2}), vget_high_s32(vacc${M}x${N+2}));
+        const int32x2_t vpsum${M}x${ABC[N+3]} = vadd_s32(vget_low_s32(vacc${M}x${N+3}), vget_high_s32(vacc${M}x${N+3}));
+        const int32x2_t vsum${M}x${ABC[N:N+2]} = vpadd_s32(vpsum${M}x${ABC[N]}, vpsum${M}x${ABC[N+1]});
+        const int32x2_t vsum${M}x${ABC[N+2:N+4]} = vpadd_s32(vpsum${M}x${ABC[N+2]}, vpsum${M}x${ABC[N+3]});
+        int32x4_t vacc${M}x${ABC[N:N+4]} = vcombine_s32(vsum${M}x${ABC[N:N+2]}, vsum${M}x${ABC[N+2:N+4]} );
+#endif
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    $for M in range(MR):
+      $for N in range(0, NR, 4):
+        vacc${M}x${ABC[N:N+4]} = vqrdmulhq_s32(vacc${M}x${ABC[N:N+4]}, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    $for M in range(MR):
+      $for N in range(0, NR, 4):
+        vacc${M}x${ABC[N:N+4]} = vsraq_n_s32(vacc${M}x${ABC[N:N+4]}, vbicq_s32(vacc${M}x${ABC[N:N+4]}, vzero_shift_mask), 31);
+
+    $for M in range(MR):
+      $for N in range(0, NR, 4):
+        vacc${M}x${ABC[N:N+4]} = vrshlq_s32(vacc${M}x${ABC[N:N+4]}, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    $for M in range(MR):
+      $for N in range(0, NR, 8):
+        const int16x8_t vacc${M}x${ABC[N:N+8]} = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc${M}x${ABC[N:N+4]}), vacc${M}x${ABC[N+4:N+8]}), voutput_zero_point);
+    $for M in range(MR):
+      $for N in range(0, NR, 16):
+        $if N + 8 < NR:
+          int8x16_t vout${M}x${ABC[N:N+16]} = vqmovn_high_s16(vqmovn_s16(vacc${M}x${ABC[N:N+8]}), vacc${M}x${ABC[N+8:N+16]});
+        $elif M % 2 == 1:
+          int8x16_t vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vqmovn_high_s16(vqmovn_s16(vacc${M-1}x${ABC[N:N+8]}), vacc${M}x${ABC[N:N+8]});
+        $elif M + 1 == MR:
+          int8x8_t vout${M}x${ABC[N:N+8]} = vqmovn_s16(vacc${M}x${ABC[N:N+8]});
+#else
+    $for M in range(MR):
+      $for N in range(0, NR, 8):
+        const int16x8_t vacc${M}x${ABC[N:N+8]} = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc${M}x${ABC[N:N+4]}), vqmovn_s32(vacc${M}x${ABC[N+4:N+8]})), voutput_zero_point);
+
+    $for M in range(MR):
+      $for N in range(0, NR, 16):
+        $if N + 8 < NR:
+          int8x16_t vout${M}x${ABC[N:N+16]} = vcombine_s8(vqmovn_s16(vacc${M}x${ABC[N:N+8]}), vqmovn_s16(vacc${M}x${ABC[N+8:N+16]}));
+        $elif M % 2 == 1:
+          int8x16_t vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vcombine_s8(vqmovn_s16(vacc${M-1}x${ABC[N:N+8]}), vqmovn_s16(vacc${M}x${ABC[N:N+8]}));
+        $elif M + 1 == MR:
+          int8x8_t vout${M}x${ABC[N:N+8]} = vqmovn_s16(vacc${M}x${ABC[N:N+8]});
+#endif
+    $if NR == 8 and MR == 1:
+      const int8x8_t voutput_min = vld1_dup_s8(&params->neon.output_min);
+      const int8x8_t voutput_max = vld1_dup_s8(&params->neon.output_max);
+    $else:
+      const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+      const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    $for M in reversed(range(MR)):
+      $for N in range(0, NR, 16):
+        $if N + 8 < NR:
+          vout${M}x${ABC[N:N+16]} = vmaxq_s8(vout${M}x${ABC[N:N+16]}, voutput_min);
+        $elif M % 2 == 1:
+          vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vmaxq_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}, voutput_min);
+        $elif M + 1 == MR:
+          $if NR == 8 and MR == 1:
+            vout${M}x${ABC[N:N+8]} = vmax_s8(vout${M}x${ABC[N:N+8]}, voutput_min);
+          $else:
+            vout${M}x${ABC[N:N+8]} = vmax_s8(vout${M}x${ABC[N:N+8]}, vget_low_s8(voutput_min));
+
+    $for M in reversed(range(MR)):
+      $for N in range(0, NR, 16):
+        $if N + 8 < NR:
+          vout${M}x${ABC[N:N+16]} = vminq_s8(vout${M}x${ABC[N:N+16]}, voutput_max);
+        $elif M % 2 == 1:
+          vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vminq_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}, voutput_max);
+        $elif M + 1 == MR:
+          $if NR == 8 and MR == 1:
+            vout${M}x${ABC[N:N+8]} = vmin_s8(vout${M}x${ABC[N:N+8]}, voutput_max);
+          $else:
+            vout${M}x${ABC[N:N+8]} = vmin_s8(vout${M}x${ABC[N:N+8]}, vget_low_s8(voutput_max));
+
+    if (nc >= ${NR}) {
+      $for M in reversed(range(MR)):
+        $for N in range(0, NR, 16):
+          $if N + 8 < NR:
+            vst1q_s8(c${M} + ${N}, vout${M}x${ABC[N:N+16]});
+          $elif M % 2 == 1:
+            vst1_s8(c${M} + ${N}, vget_high_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}));
+            vst1_s8(c${M-1} + ${N}, vget_low_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}));
+          $elif M + 1 == MR:
+            vst1_s8(c${M} + ${N}, vout${M}x${ABC[N:N+8]});
+
+      $for M in reversed(range(MR)):
+        c${M} = (int8_t*) ((uintptr_t) c${M} + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= ${NR};
+    } else {
+      $if NR == 16:
+        $for M in range(MR):
+          $if M % 2 == 1:
+            int8x16_t vout${M-1}x01234567_${M}x01234567 = vcombine_s8(vget_low_s8(vout${M-1}x0123456789ABCDEF), vget_low_s8(vout${M}x0123456789ABCDEF));
+          $elif M + 1 == MR:
+            int8x8_t vout${M}x01234567 = vget_low_s8(vout${M}x0123456789ABCDEF);
+        if (nc & 8) {
+          $for M in reversed(range(MR)):
+            $if M % 2 == 1:
+              vst1_s8(c${M}, vget_high_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]})); c${M} += 8;
+              vst1_s8(c${M-1}, vget_low_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]})); c${M-1} += 8;
+            $elif M + 1 == MR:
+              vst1_s8(c${M}, vout${M}x${ABC[N:N+8]}); c${M} += 8;
+          $for M in reversed(range(MR)):
+            $if M % 2 == 1:
+              vout${M-1}x01234567_${M}x01234567 = vcombine_s8(vget_high_s8(vout${M-1}x0123456789ABCDEF), vget_high_s8(vout${M}x0123456789ABCDEF));
+            $elif M + 1 == MR:
+              vout${M}x01234567 = vget_high_s8(vout${M}x0123456789ABCDEF);
+        }
+      if (nc & 4) {
+        $for M in reversed(range(MR)):
+          $if M % 2 == 1:
+            vst1q_lane_u32(__builtin_assume_aligned(c${M}, 1), vreinterpretq_u32_s8(vout${M-1}x01234567_${M}x01234567), 2); c${M} += 4;
+            vst1q_lane_u32(__builtin_assume_aligned(c${M-1}, 1), vreinterpretq_u32_s8(vout${M-1}x01234567_${M}x01234567), 0); c${M-1} += 4;
+          $elif M + 1 == MR:
+            vst1_lane_u32(__builtin_assume_aligned(c${M}, 1), vreinterpret_u32_s8(vout${M}x01234567), 0); c${M} += 4;
+        $for M in reversed(range(MR)):
+          $if M % 2 == 1:
+            vout${M-1}x01234567_${M}x01234567 = vextq_s8(vout${M-1}x01234567_${M}x01234567, vout${M-1}x01234567_${M}x01234567, 4);
+          $elif M + 1 == MR:
+            vout${M}x01234567 = vext_s8(vout${M}x01234567, vout${M}x01234567, 4);
+      }
+      if (nc & 2) {
+        $for M in reversed(range(MR)):
+          $if M % 2 == 1:
+            vst1q_lane_u16(__builtin_assume_aligned(c${M}, 1), vreinterpretq_u16_s8(vout${M-1}x01234567_${M}x01234567), 4); c${M} += 2;
+            vst1q_lane_u16(__builtin_assume_aligned(c${M-1}, 1), vreinterpretq_u16_s8(vout${M-1}x01234567_${M}x01234567), 0); c${M-1} += 2;
+          $elif M + 1 == MR:
+            vst1_lane_u16(__builtin_assume_aligned(c${M}, 1), vreinterpret_u16_s8(vout${M}x01234567), 0); c${M} += 2;
+        $for M in reversed(range(MR)):
+          $if M % 2 == 1:
+            vout${M-1}x01234567_${M}x01234567 = vextq_s8(vout${M-1}x01234567_${M}x01234567, vout${M-1}x01234567_${M}x01234567, 2);
+          $elif M + 1 == MR:
+            vout${M}x01234567 = vext_s8(vout${M}x01234567, vout${M}x01234567, 2);
+      }
+      if (nc & 1) {
+        $for M in reversed(range(MR)):
+          $if M % 2 == 1:
+            vst1q_lane_s8(c${M}, vout${M-1}x01234567_${M}x01234567, 8);
+            vst1q_lane_s8(c${M-1}, vout${M-1}x01234567_${M}x01234567, 0);
+          $elif M + 1 == MR:
+            vst1_lane_s8(c${M}, vout${M}x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-igemm/c2-neon-mull-padal-dup.c.in b/src/qs8-igemm/c2-neon-mull-padal-dup.c.in
new file mode 100644
index 0000000..79e350b
--- /dev/null
+++ b/src/qs8-igemm/c2-neon-mull-padal-dup.c.in
@@ -0,0 +1,302 @@
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+$assert NR % 8 == 0
+$assert 8 <= NR <= 16
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_${MR}x${NR}c2__neon_${"mlal" if MLA else "mull"}_padal_dup(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= ${MR});
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (${MR} * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  int8_t* c0 = c;
+  $for M in range(1, MR):
+    int8_t* c${M} = (int8_t*) ((uintptr_t) c${M-1} + cm_stride);
+    $if M % 2 == 0:
+      if XNN_UNPREDICTABLE(mr <= ${M}) {
+        c${M} = c${M-1};
+      }
+    $elif M + 1 == MR:
+      if XNN_UNPREDICTABLE(mr != ${M+1}) {
+        c${M} = c${M-1};
+      }
+    $else:
+      if XNN_UNPREDICTABLE(mr < ${M+1}) {
+        c${M} = c${M-1};
+      }
+
+  do {
+    $for N in range(0, NR, 4):
+      int32x4_t vacc0x${ABC[N:N+4]} = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    $for M in range(1, MR):
+      $for N in range(0, NR, 4):
+        int32x4_t vacc${M}x${ABC[N:N+4]} = vacc0x${ABC[N:N+4]};
+
+    size_t p = ks;
+    do {
+      $for M in range(MR):
+        const int8_t* restrict a${M} = a[${M}];
+        if XNN_UNPREDICTABLE(a${M} != zero) {
+          a${M} = (const int8_t*) ((uintptr_t) a${M} + a_offset);
+        }
+      a += ${MR};
+
+      size_t k = kc;
+
+      $if MLA:
+        while (k >= 16 * sizeof(int8_t)) {
+          $for M in range(MR):
+            const int8x8_t va${M}x0 = vld1_s8(a${M}); a${M} += 8;
+            const int8x8_t va${M}x1 = vld1_s8(a${M}); a${M} += 8;
+
+          $for K in range(4):
+            $for N in range(0, NR, 4):
+              const int8x8_t vb${ABC[N:N+4]}c${K}x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          $for K in range(4):
+            $for N in range(0, NR, 4):
+              $for M in range(MR):
+                int16x8_t vprod${M}x${ABC[N:N+4]}c${K} = vmull_s8(vb${ABC[N:N+4]}c${K}x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va${M}x0), ${K})));
+              const int8x8_t vb${ABC[N:N+4]}c${K}x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+              $for M in range(MR):
+                vprod${M}x${ABC[N:N+4]}c${K} = vmlal_s8(vprod${M}x${ABC[N:N+4]}c${K}, vb${ABC[N:N+4]}c${K}x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va${M}x1), ${K})));
+              $for M in range(MR):
+                vacc${M}x${ABC[N:N+4]} = vpadalq_s16(vacc${M}x${ABC[N:N+4]}, vprod${M}x${ABC[N:N+4]}c${K});
+
+          k -= 16 * sizeof(int8_t);
+        }
+
+      ${"if" if MLA else "while"} (k >= 8 * sizeof(int8_t)) {
+        $for M in range(MR):
+          const int8x8_t va${M} = vld1_s8(a${M}); a${M} += 8;
+
+        $for K in range(4):
+          $for N in range(0, NR, 4):
+            const int8x8_t vb${ABC[N:N+4]}c${K} = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        $for M in range(MR):
+          $for N in range(0, NR, 4):
+            $for K in range(4):
+              const int16x8_t vprod${M}x${ABC[N:N+4]}c${K} = vmull_s8(vb${ABC[N:N+4]}c${K}, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va${M}), ${K})));
+            $for K in range(4):
+              vacc${M}x${ABC[N:N+4]} = vpadalq_s16(vacc${M}x${ABC[N:N+4]}, vprod${M}x${ABC[N:N+4]}c${K});
+
+        k -= 8 * sizeof(int8_t);
+      }
+
+      if XNN_UNLIKELY(k != 0) {
+        $for M in range(MR):
+          const int8x8_t va${M} = vld1_s8(a${M}); a${M} = (const int8_t*) ((uintptr_t) a${M} + k);
+
+        $for N in range(0, NR, 4):
+          const int8x8_t vb${ABC[N:N+4]}c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        $for M in range(MR):
+          $for N in range(0, NR, 4):
+            const int16x8_t vprod${M}x${ABC[N:N+4]}c0 = vmull_s8(vb${ABC[N:N+4]}c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va${M}), 0)));
+            vacc${M}x${ABC[N:N+4]} = vpadalq_s16(vacc${M}x${ABC[N:N+4]}, vprod${M}x${ABC[N:N+4]}c0);
+
+        if (k > 2 * sizeof(int8_t)) {
+          $for N in range(0, NR, 4):
+            const int8x8_t vb${ABC[N:N+4]}c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          $for M in range(MR):
+            $for N in range(0, NR, 4):
+              const int16x8_t vprod${M}x${ABC[N:N+4]}c1 = vmull_s8(vb${ABC[N:N+4]}c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va${M}), 1)));
+              vacc${M}x${ABC[N:N+4]} = vpadalq_s16(vacc${M}x${ABC[N:N+4]}, vprod${M}x${ABC[N:N+4]}c1);
+
+          if (k > 4 * sizeof(int8_t)) {
+            $for N in range(0, NR, 4):
+              const int8x8_t vb${ABC[N:N+4]}c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+            $for M in range(MR):
+              $for N in range(0, NR, 4):
+                const int16x8_t vprod${M}x${ABC[N:N+4]}c2 = vmull_s8(vb${ABC[N:N+4]}c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va${M}), 2)));
+                vacc${M}x${ABC[N:N+4]} = vpadalq_s16(vacc${M}x${ABC[N:N+4]}, vprod${M}x${ABC[N:N+4]}c2);
+          }
+        }
+      }
+      p -= ${MR} * sizeof(void*);
+    } while (p != 0);
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    $for M in range(MR):
+      $for N in range(0, NR, 4):
+        vacc${M}x${ABC[N:N+4]} = vqrdmulhq_s32(vacc${M}x${ABC[N:N+4]}, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    $for M in range(MR):
+      $for N in range(0, NR, 4):
+        vacc${M}x${ABC[N:N+4]} = vsraq_n_s32(vacc${M}x${ABC[N:N+4]}, vbicq_s32(vacc${M}x${ABC[N:N+4]}, vzero_shift_mask), 31);
+
+    $for M in range(MR):
+      $for N in range(0, NR, 4):
+        vacc${M}x${ABC[N:N+4]} = vrshlq_s32(vacc${M}x${ABC[N:N+4]}, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    $for M in range(MR):
+      $for N in range(0, NR, 8):
+        const int16x8_t vacc${M}x${ABC[N:N+8]} = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc${M}x${ABC[N:N+4]}), vacc${M}x${ABC[N+4:N+8]}), voutput_zero_point);
+
+    $for M in range(MR):
+      $for N in range(0, NR, 16):
+        $if N + 8 < NR:
+          int8x16_t vout${M}x${ABC[N:N+16]} = vqmovn_high_s16(vqmovn_s16(vacc${M}x${ABC[N:N+8]}), vacc${M}x${ABC[N+8:N+16]});
+        $elif M % 2 == 1:
+          int8x16_t vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vqmovn_high_s16(vqmovn_s16(vacc${M-1}x${ABC[N:N+8]}), vacc${M}x${ABC[N:N+8]});
+        $elif M + 1 == MR:
+          int8x8_t vout${M}x${ABC[N:N+8]} = vqmovn_s16(vacc${M}x${ABC[N:N+8]});
+#else
+    $for M in range(MR):
+      $for N in range(0, NR, 8):
+        const int16x8_t vacc${M}x${ABC[N:N+8]} = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc${M}x${ABC[N:N+4]}), vqmovn_s32(vacc${M}x${ABC[N+4:N+8]})), voutput_zero_point);
+
+    $for M in range(MR):
+      $for N in range(0, NR, 16):
+        $if N + 8 < NR:
+          int8x16_t vout${M}x${ABC[N:N+16]} = vcombine_s8(vqmovn_s16(vacc${M}x${ABC[N:N+8]}), vqmovn_s16(vacc${M}x${ABC[N+8:N+16]}));
+        $elif M % 2 == 1:
+          int8x16_t vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vcombine_s8(vqmovn_s16(vacc${M-1}x${ABC[N:N+8]}), vqmovn_s16(vacc${M}x${ABC[N:N+8]}));
+        $elif M + 1 == MR:
+          int8x8_t vout${M}x${ABC[N:N+8]} = vqmovn_s16(vacc${M}x${ABC[N:N+8]});
+#endif
+    $if NR == 8 and MR == 1:
+      const int8x8_t voutput_min = vld1_dup_s8(&params->neon.output_min);
+      const int8x8_t voutput_max = vld1_dup_s8(&params->neon.output_max);
+    $else:
+      const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+      const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    $for M in reversed(range(MR)):
+      $for N in range(0, NR, 16):
+        $if N + 8 < NR:
+          vout${M}x${ABC[N:N+16]} = vmaxq_s8(vout${M}x${ABC[N:N+16]}, voutput_min);
+        $elif M % 2 == 1:
+          vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vmaxq_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}, voutput_min);
+        $elif M + 1 == MR:
+          $if NR == 8 and MR == 1:
+            vout${M}x${ABC[N:N+8]} = vmax_s8(vout${M}x${ABC[N:N+8]}, voutput_min);
+          $else:
+            vout${M}x${ABC[N:N+8]} = vmax_s8(vout${M}x${ABC[N:N+8]}, vget_low_s8(voutput_min));
+
+    $for M in reversed(range(MR)):
+      $for N in range(0, NR, 16):
+        $if N + 8 < NR:
+          vout${M}x${ABC[N:N+16]} = vminq_s8(vout${M}x${ABC[N:N+16]}, voutput_max);
+        $elif M % 2 == 1:
+          vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vminq_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}, voutput_max);
+        $elif M + 1 == MR:
+          $if NR == 8 and MR == 1:
+            vout${M}x${ABC[N:N+8]} = vmin_s8(vout${M}x${ABC[N:N+8]}, voutput_max);
+          $else:
+            vout${M}x${ABC[N:N+8]} = vmin_s8(vout${M}x${ABC[N:N+8]}, vget_low_s8(voutput_max));
+
+    if (nc >= ${NR}) {
+      $for M in reversed(range(MR)):
+        $for N in range(0, NR, 16):
+          $if N + 8 < NR:
+            vst1q_s8(c${M} + ${N}, vout${M}x${ABC[N:N+16]});
+          $elif M % 2 == 1:
+            vst1_s8(c${M} + ${N}, vget_high_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}));
+            vst1_s8(c${M-1} + ${N}, vget_low_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}));
+          $elif M + 1 == MR:
+            vst1_s8(c${M} + ${N}, vout${M}x${ABC[N:N+8]});
+
+      $for M in reversed(range(MR)):
+        c${M} = (int8_t*) ((uintptr_t) c${M} + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= ${NR};
+    } else {
+      $if NR == 16:
+        $for M in range(MR):
+          $if M % 2 == 1:
+            int8x16_t vout${M-1}x01234567_${M}x01234567 = vcombine_s8(vget_low_s8(vout${M-1}x0123456789ABCDEF), vget_low_s8(vout${M}x0123456789ABCDEF));
+          $elif M + 1 == MR:
+            int8x8_t vout${M}x01234567 = vget_low_s8(vout${M}x0123456789ABCDEF);
+        if (nc & 8) {
+          $for M in reversed(range(MR)):
+            $if M % 2 == 1:
+              vst1_s8(c${M}, vget_high_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]})); c${M} += 8;
+              vst1_s8(c${M-1}, vget_low_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]})); c${M-1} += 8;
+            $elif M + 1 == MR:
+              vst1_s8(c${M}, vout${M}x${ABC[N:N+8]}); c${M} += 8;
+          $for M in reversed(range(MR)):
+            $if M % 2 == 1:
+              vout${M-1}x01234567_${M}x01234567 = vcombine_s8(vget_high_s8(vout${M-1}x0123456789ABCDEF), vget_high_s8(vout${M}x0123456789ABCDEF));
+            $elif M + 1 == MR:
+              vout${M}x01234567 = vget_high_s8(vout${M}x0123456789ABCDEF);
+        }
+      if (nc & 4) {
+        $for M in reversed(range(MR)):
+          $if M % 2 == 1:
+            vst1q_lane_u32(__builtin_assume_aligned(c${M}, 1), vreinterpretq_u32_s8(vout${M-1}x01234567_${M}x01234567), 2); c${M} += 4;
+            vst1q_lane_u32(__builtin_assume_aligned(c${M-1}, 1), vreinterpretq_u32_s8(vout${M-1}x01234567_${M}x01234567), 0); c${M-1} += 4;
+          $elif M + 1 == MR:
+            vst1_lane_u32(__builtin_assume_aligned(c${M}, 1), vreinterpret_u32_s8(vout${M}x01234567), 0); c${M} += 4;
+        $for M in reversed(range(MR)):
+          $if M % 2 == 1:
+            vout${M-1}x01234567_${M}x01234567 = vextq_s8(vout${M-1}x01234567_${M}x01234567, vout${M-1}x01234567_${M}x01234567, 4);
+          $elif M + 1 == MR:
+            vout${M}x01234567 = vext_s8(vout${M}x01234567, vout${M}x01234567, 4);
+      }
+      if (nc & 2) {
+        $for M in reversed(range(MR)):
+          $if M % 2 == 1:
+            vst1q_lane_u16(__builtin_assume_aligned(c${M}, 1), vreinterpretq_u16_s8(vout${M-1}x01234567_${M}x01234567), 4); c${M} += 2;
+            vst1q_lane_u16(__builtin_assume_aligned(c${M-1}, 1), vreinterpretq_u16_s8(vout${M-1}x01234567_${M}x01234567), 0); c${M-1} += 2;
+          $elif M + 1 == MR:
+            vst1_lane_u16(__builtin_assume_aligned(c${M}, 1), vreinterpret_u16_s8(vout${M}x01234567), 0); c${M} += 2;
+        $for M in reversed(range(MR)):
+          $if M % 2 == 1:
+            vout${M-1}x01234567_${M}x01234567 = vextq_s8(vout${M-1}x01234567_${M}x01234567, vout${M-1}x01234567_${M}x01234567, 2);
+          $elif M + 1 == MR:
+            vout${M}x01234567 = vext_s8(vout${M}x01234567, vout${M}x01234567, 2);
+      }
+      if (nc & 1) {
+        $for M in reversed(range(MR)):
+          $if M % 2 == 1:
+            vst1q_lane_s8(c${M}, vout${M-1}x01234567_${M}x01234567, 8);
+            vst1q_lane_s8(c${M-1}, vout${M-1}x01234567_${M}x01234567, 0);
+          $elif M + 1 == MR:
+            vst1_lane_s8(c${M}, vout${M}x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-igemm/c8-neon-mull-padal.c.in b/src/qs8-igemm/c8-neon-mull-padal.c.in
new file mode 100644
index 0000000..9318ad3
--- /dev/null
+++ b/src/qs8-igemm/c8-neon-mull-padal.c.in
@@ -0,0 +1,284 @@
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+$assert NR % 8 == 0
+$assert 8 <= NR <= 16
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_${MR}x${NR}c8__neon_${"mlal" if MLA else "mull"}_padal(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= ${MR});
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (${MR} * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  int8_t* c0 = c;
+  $for M in range(1, MR):
+    int8_t* c${M} = (int8_t*) ((uintptr_t) c${M-1} + cm_stride);
+    $if M % 2 == 0:
+      if XNN_UNPREDICTABLE(mr <= ${M}) {
+        c${M} = c${M-1};
+      }
+    $elif M + 1 == MR:
+      if XNN_UNPREDICTABLE(mr != ${M+1}) {
+        c${M} = c${M-1};
+      }
+    $else:
+      if XNN_UNPREDICTABLE(mr < ${M+1}) {
+        c${M} = c${M-1};
+      }
+
+  do {
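+    // Initialize each per-column accumulator with its bias in lane 0; the remaining lanes start at zero and are summed together after the K loop.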
+    $for N in range(NR):
+      int32x4_t vacc0x${N} = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    $for M in range(1, MR):
+      $for N in range(NR):
+        int32x4_t vacc${M}x${N} = vacc0x${N};
+
+    size_t p = ks;
+    do {
+      $for M in range(MR):
+        const int8_t* restrict a${M} = a[${M}];
+        if XNN_UNPREDICTABLE(a${M} != zero) {
+          a${M} = (const int8_t*) ((uintptr_t) a${M} + a_offset);
+        }
+      a += ${MR};
+
+      size_t k = kc;
+      $if MLA:
+        // 2x partially unrolled loop to load 16 bytes at a time using MLA.
+        while (k >= 16 * sizeof(int8_t)) {
+          $for M in range(MR):
+            const int8x8_t va${M}x0 = vld1_s8(a${M}); a${M} += 8;
+            const int8x8_t va${M}x1 = vld1_s8(a${M}); a${M} += 8;
+
+          $for N in range(NR):
+            const int8x8_t vb${N}x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          $for N in range(NR):
+            const int8x8_t vb${N}x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            $for M in range(MR):
+              int16x8_t vprod${M}x${N} = vmull_s8(vb${N}x0, va${M}x0);
+            $for M in range(MR):
+              vprod${M}x${N} = vmlal_s8(vprod${M}x${N}, vb${N}x1, va${M}x1);
+            $for M in range(MR):
+              vacc${M}x${N} = vpadalq_s16(vacc${M}x${N}, vprod${M}x${N});
+
+          k -= 16 * sizeof(int8_t);
+        }
+
+      // Handle 8 bytes at a time using MUL.
+      ${"if" if MLA else "while"} (k > 0) {
+        $for M in range(MR):
+          const int8x8_t va${M} = vld1_s8(a${M}); a${M} += 8;
+
+        $for N in range(NR):
+          const int8x8_t vb${N} = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          $for M in range(MR):
+            const int16x8_t vprod${M}x${N} = vmull_s8(vb${N}, va${M});
+          $for M in range(MR):
+            vacc${M}x${N} = vpadalq_s16(vacc${M}x${N}, vprod${M}x${N});
+
+        k -= 8 * sizeof(int8_t);
+      }
+
+      p -= ${MR} * sizeof(void*);
+    } while (p != 0);
+
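+    // Horizontally add the four lanes of every per-column accumulator (bias plus partial sums) and pack four adjacent output columns into one int32x4 register.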
+#if XNN_ARCH_ARM64
+    $for M in range(MR):
+      $for N in range(0, NR, 4):
+        const int32x4_t vsum${M}x${ABC[N:N+2]} = vpaddq_s32(vacc${M}x${N}, vacc${M}x${N+1});
+        const int32x4_t vsum${M}x${ABC[N+2:N+4]} = vpaddq_s32(vacc${M}x${N+2}, vacc${M}x${N+3});
+    $for M in range(MR):
+      $for N in range(0, NR, 4):
+        int32x4_t vacc${M}x${ABC[N:N+4]} = vpaddq_s32(vsum${M}x${ABC[N:N+2]}, vsum${M}x${ABC[N+2:N+4]});
+#else
+    $for M in range(MR):
+      $for N in range(0, NR, 4):
+        const int32x2_t vpsum${M}x${ABC[N]} = vadd_s32(vget_low_s32(vacc${M}x${N}), vget_high_s32(vacc${M}x${N}));
+        const int32x2_t vpsum${M}x${ABC[N+1]} = vadd_s32(vget_low_s32(vacc${M}x${N+1}), vget_high_s32(vacc${M}x${N+1}));
+        const int32x2_t vpsum${M}x${ABC[N+2]} = vadd_s32(vget_low_s32(vacc${M}x${N+2}), vget_high_s32(vacc${M}x${N+2}));
+        const int32x2_t vpsum${M}x${ABC[N+3]} = vadd_s32(vget_low_s32(vacc${M}x${N+3}), vget_high_s32(vacc${M}x${N+3}));
+        const int32x2_t vsum${M}x${ABC[N:N+2]} = vpadd_s32(vpsum${M}x${ABC[N]}, vpsum${M}x${ABC[N+1]});
+        const int32x2_t vsum${M}x${ABC[N+2:N+4]} = vpadd_s32(vpsum${M}x${ABC[N+2]}, vpsum${M}x${ABC[N+3]});
+        int32x4_t vacc${M}x${ABC[N:N+4]} = vcombine_s32(vsum${M}x${ABC[N:N+2]}, vsum${M}x${ABC[N+2:N+4]});
+#endif
+
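+    // Requantize: Q31 fixed-point multiply with rounding, then a rounding right shift; negative values are pre-adjusted so that ties round away from zero.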
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    $for M in range(MR):
+      $for N in range(0, NR, 4):
+        vacc${M}x${ABC[N:N+4]} = vqrdmulhq_s32(vacc${M}x${ABC[N:N+4]}, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    $for M in range(MR):
+      $for N in range(0, NR, 4):
+        vacc${M}x${ABC[N:N+4]} = vsraq_n_s32(vacc${M}x${ABC[N:N+4]}, vbicq_s32(vacc${M}x${ABC[N:N+4]}, vzero_shift_mask), 31);
+
+    $for M in range(MR):
+      $for N in range(0, NR, 4):
+        vacc${M}x${ABC[N:N+4]} = vrshlq_s32(vacc${M}x${ABC[N:N+4]}, vright_shift);
+
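+    // Add the output zero point with int16 saturation, narrow to int8 with saturation, and clamp to [output_min, output_max].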
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    $for M in range(MR):
+      $for N in range(0, NR, 8):
+        const int16x8_t vacc${M}x${ABC[N:N+8]} = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc${M}x${ABC[N:N+4]}), vacc${M}x${ABC[N+4:N+8]}), voutput_zero_point);
+    $for M in range(MR):
+      $for N in range(0, NR, 16):
+        $if N + 8 < NR:
+          int8x16_t vout${M}x${ABC[N:N+16]} = vqmovn_high_s16(vqmovn_s16(vacc${M}x${ABC[N:N+8]}), vacc${M}x${ABC[N+8:N+16]});
+        $elif M % 2 == 1:
+          int8x16_t vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vqmovn_high_s16(vqmovn_s16(vacc${M-1}x${ABC[N:N+8]}), vacc${M}x${ABC[N:N+8]});
+        $elif M + 1 == MR:
+          int8x8_t vout${M}x${ABC[N:N+8]} = vqmovn_s16(vacc${M}x${ABC[N:N+8]});
+#else
+    $for M in range(MR):
+      $for N in range(0, NR, 8):
+        const int16x8_t vacc${M}x${ABC[N:N+8]} = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc${M}x${ABC[N:N+4]}), vqmovn_s32(vacc${M}x${ABC[N+4:N+8]})), voutput_zero_point);
+
+    $for M in range(MR):
+      $for N in range(0, NR, 16):
+        $if N + 8 < NR:
+          int8x16_t vout${M}x${ABC[N:N+16]} = vcombine_s8(vqmovn_s16(vacc${M}x${ABC[N:N+8]}), vqmovn_s16(vacc${M}x${ABC[N+8:N+16]}));
+        $elif M % 2 == 1:
+          int8x16_t vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vcombine_s8(vqmovn_s16(vacc${M-1}x${ABC[N:N+8]}), vqmovn_s16(vacc${M}x${ABC[N:N+8]}));
+        $elif M + 1 == MR:
+          int8x8_t vout${M}x${ABC[N:N+8]} = vqmovn_s16(vacc${M}x${ABC[N:N+8]});
+#endif
+    $if NR == 8 and MR == 1:
+      const int8x8_t voutput_min = vld1_dup_s8(&params->neon.output_min);
+      const int8x8_t voutput_max = vld1_dup_s8(&params->neon.output_max);
+    $else:
+      const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+      const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    $for M in reversed(range(MR)):
+      $for N in range(0, NR, 16):
+        $if N + 8 < NR:
+          vout${M}x${ABC[N:N+16]} = vmaxq_s8(vout${M}x${ABC[N:N+16]}, voutput_min);
+        $elif M % 2 == 1:
+          vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vmaxq_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}, voutput_min);
+        $elif M + 1 == MR:
+          $if NR == 8 and MR == 1:
+            vout${M}x${ABC[N:N+8]} = vmax_s8(vout${M}x${ABC[N:N+8]}, voutput_min);
+          $else:
+            vout${M}x${ABC[N:N+8]} = vmax_s8(vout${M}x${ABC[N:N+8]}, vget_low_s8(voutput_min));
+
+    $for M in reversed(range(MR)):
+      $for N in range(0, NR, 16):
+        $if N + 8 < NR:
+          vout${M}x${ABC[N:N+16]} = vminq_s8(vout${M}x${ABC[N:N+16]}, voutput_max);
+        $elif M % 2 == 1:
+          vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vminq_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}, voutput_max);
+        $elif M + 1 == MR:
+          $if NR == 8 and MR == 1:
+            vout${M}x${ABC[N:N+8]} = vmin_s8(vout${M}x${ABC[N:N+8]}, voutput_max);
+          $else:
+            vout${M}x${ABC[N:N+8]} = vmin_s8(vout${M}x${ABC[N:N+8]}, vget_low_s8(voutput_max));
+
+    if (nc >= ${NR}) {
+      $for M in reversed(range(MR)):
+        $for N in range(0, NR, 16):
+          $if N + 8 < NR:
+            vst1q_s8(c${M} + ${N}, vout${M}x${ABC[N:N+16]});
+          $elif M % 2 == 1:
+            vst1_s8(c${M} + ${N}, vget_high_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}));
+            vst1_s8(c${M-1} + ${N}, vget_low_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}));
+          $elif M + 1 == MR:
+            vst1_s8(c${M} + ${N}, vout${M}x${ABC[N:N+8]});
+
+      $for M in reversed(range(MR)):
+        c${M} = (int8_t*) ((uintptr_t) c${M} + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= ${NR};
+    } else {
+      $if NR == 16:
+        $for M in range(MR):
+          $if M % 2 == 1:
+            int8x16_t vout${M-1}x01234567_${M}x01234567 = vcombine_s8(vget_low_s8(vout${M-1}x0123456789ABCDEF), vget_low_s8(vout${M}x0123456789ABCDEF));
+          $elif M + 1 == MR:
+            int8x8_t vout${M}x01234567 = vget_low_s8(vout${M}x0123456789ABCDEF);
+        if (nc & 8) {
+          $for M in reversed(range(MR)):
+            $if M % 2 == 1:
+              vst1_s8(c${M}, vget_high_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]})); c${M} += 8;
+              vst1_s8(c${M-1}, vget_low_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]})); c${M-1} += 8;
+            $elif M + 1 == MR:
+              vst1_s8(c${M}, vout${M}x${ABC[N:N+8]}); c${M} += 8;
+          $for M in reversed(range(MR)):
+            $if M % 2 == 1:
+              vout${M-1}x01234567_${M}x01234567 = vcombine_s8(vget_high_s8(vout${M-1}x0123456789ABCDEF), vget_high_s8(vout${M}x0123456789ABCDEF));
+            $elif M + 1 == MR:
+              vout${M}x01234567 = vget_high_s8(vout${M}x0123456789ABCDEF);
+        }
+      if (nc & 4) {
+        $for M in reversed(range(MR)):
+          $if M % 2 == 1:
+            vst1q_lane_u32(__builtin_assume_aligned(c${M}, 1), vreinterpretq_u32_s8(vout${M-1}x01234567_${M}x01234567), 2); c${M} += 4;
+            vst1q_lane_u32(__builtin_assume_aligned(c${M-1}, 1), vreinterpretq_u32_s8(vout${M-1}x01234567_${M}x01234567), 0); c${M-1} += 4;
+          $elif M + 1 == MR:
+            vst1_lane_u32(__builtin_assume_aligned(c${M}, 1), vreinterpret_u32_s8(vout${M}x01234567), 0); c${M} += 4;
+        $for M in reversed(range(MR)):
+          $if M % 2 == 1:
+            vout${M-1}x01234567_${M}x01234567 = vextq_s8(vout${M-1}x01234567_${M}x01234567, vout${M-1}x01234567_${M}x01234567, 4);
+          $elif M + 1 == MR:
+            vout${M}x01234567 = vext_s8(vout${M}x01234567, vout${M}x01234567, 4);
+      }
+      if (nc & 2) {
+        $for M in reversed(range(MR)):
+          $if M % 2 == 1:
+            vst1q_lane_u16(__builtin_assume_aligned(c${M}, 1), vreinterpretq_u16_s8(vout${M-1}x01234567_${M}x01234567), 4); c${M} += 2;
+            vst1q_lane_u16(__builtin_assume_aligned(c${M-1}, 1), vreinterpretq_u16_s8(vout${M-1}x01234567_${M}x01234567), 0); c${M-1} += 2;
+          $elif M + 1 == MR:
+            vst1_lane_u16(__builtin_assume_aligned(c${M}, 1), vreinterpret_u16_s8(vout${M}x01234567), 0); c${M} += 2;
+        $for M in reversed(range(MR)):
+          $if M % 2 == 1:
+            vout${M-1}x01234567_${M}x01234567 = vextq_s8(vout${M-1}x01234567_${M}x01234567, vout${M-1}x01234567_${M}x01234567, 2);
+          $elif M + 1 == MR:
+            vout${M}x01234567 = vext_s8(vout${M}x01234567, vout${M}x01234567, 2);
+      }
+      if (nc & 1) {
+        $for M in reversed(range(MR)):
+          $if M % 2 == 1:
+            vst1q_lane_s8(c${M}, vout${M-1}x01234567_${M}x01234567, 8);
+            vst1q_lane_s8(c${M-1}, vout${M-1}x01234567_${M}x01234567, 0);
+          $elif M + 1 == MR:
+            vst1_lane_s8(c${M}, vout${M}x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/12x8c4-minmax-neondot.c b/src/qs8-igemm/gen/12x8c4-minmax-neondot.c
deleted file mode 100644
index 4ecc1e7..0000000
--- a/src/qs8-igemm/gen/12x8c4-minmax-neondot.c
+++ /dev/null
@@ -1,550 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/qs8-igemm/MRxNRc4-neondot.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include <xnnpack/common.h>
-#include <xnnpack/igemm.h>
-
-
-void xnn_qs8_igemm_minmax_ukernel_12x8c4__neondot(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    size_t ks,
-    const int8_t** restrict a,
-    const void* restrict w,
-    int8_t* restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    size_t a_offset,
-    const int8_t* zero,
-    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
-{
-  assert(mr != 0);
-  assert(mr <= 12);
-  assert(nc != 0);
-  assert(kc != 0);
-  assert(ks != 0);
-  assert(ks % (12 * sizeof(void*)) == 0);
-  assert(a_offset % sizeof(int8_t) == 0);
-  assert(a != NULL);
-  assert(w != NULL);
-  assert(c != NULL);
-
-  int8_t* c0 = c;
-  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 2) {
-    c1 = c0;
-  }
-  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 2) {
-    c2 = c1;
-  }
-  int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 4) {
-    c3 = c2;
-  }
-  int8_t* c4 = (int8_t*) ((uintptr_t) c3 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 4) {
-    c4 = c3;
-  }
-  int8_t* c5 = (int8_t*) ((uintptr_t) c4 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 6) {
-    c5 = c4;
-  }
-  int8_t* c6 = (int8_t*) ((uintptr_t) c5 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 6) {
-    c6 = c5;
-  }
-  int8_t* c7 = (int8_t*) ((uintptr_t) c6 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 8) {
-    c7 = c6;
-  }
-  int8_t* c8 = (int8_t*) ((uintptr_t) c7 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 8) {
-    c8 = c7;
-  }
-  int8_t* c9 = (int8_t*) ((uintptr_t) c8 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 10) {
-    c9 = c8;
-  }
-  int8_t* c10 = (int8_t*) ((uintptr_t) c9 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 10) {
-    c10 = c9;
-  }
-  int8_t* c11 = (int8_t*) ((uintptr_t) c10 + cm_stride);
-  if XNN_UNPREDICTABLE(mr != 12) {
-    c11 = c10;
-  }
-
-  do {
-    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
-    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
-    int32x4_t vacc1x0123 = vacc0x0123;
-    int32x4_t vacc1x4567 = vacc0x4567;
-    int32x4_t vacc2x0123 = vacc0x0123;
-    int32x4_t vacc2x4567 = vacc0x4567;
-    int32x4_t vacc3x0123 = vacc0x0123;
-    int32x4_t vacc3x4567 = vacc0x4567;
-    int32x4_t vacc4x0123 = vacc0x0123;
-    int32x4_t vacc4x4567 = vacc0x4567;
-    int32x4_t vacc5x0123 = vacc0x0123;
-    int32x4_t vacc5x4567 = vacc0x4567;
-    int32x4_t vacc6x0123 = vacc0x0123;
-    int32x4_t vacc6x4567 = vacc0x4567;
-    int32x4_t vacc7x0123 = vacc0x0123;
-    int32x4_t vacc7x4567 = vacc0x4567;
-    int32x4_t vacc8x0123 = vacc0x0123;
-    int32x4_t vacc8x4567 = vacc0x4567;
-    int32x4_t vacc9x0123 = vacc0x0123;
-    int32x4_t vacc9x4567 = vacc0x4567;
-    int32x4_t vacc10x0123 = vacc0x0123;
-    int32x4_t vacc10x4567 = vacc0x4567;
-    int32x4_t vacc11x0123 = vacc0x0123;
-    int32x4_t vacc11x4567 = vacc0x4567;
-
-    size_t p = ks;
-    do {
-      const int8_t* restrict a0 = a[0];
-      if XNN_UNPREDICTABLE(a0 != zero) {
-        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
-      }
-      const int8_t* restrict a1 = a[1];
-      if XNN_UNPREDICTABLE(a1 != zero) {
-        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
-      }
-      const int8_t* restrict a2 = a[2];
-      if XNN_UNPREDICTABLE(a2 != zero) {
-        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
-      }
-      const int8_t* restrict a3 = a[3];
-      if XNN_UNPREDICTABLE(a3 != zero) {
-        a3 = (const int8_t*) ((uintptr_t) a3 + a_offset);
-      }
-      const int8_t* restrict a4 = a[4];
-      if XNN_UNPREDICTABLE(a4 != zero) {
-        a4 = (const int8_t*) ((uintptr_t) a4 + a_offset);
-      }
-      const int8_t* restrict a5 = a[5];
-      if XNN_UNPREDICTABLE(a5 != zero) {
-        a5 = (const int8_t*) ((uintptr_t) a5 + a_offset);
-      }
-      const int8_t* restrict a6 = a[6];
-      if XNN_UNPREDICTABLE(a6 != zero) {
-        a6 = (const int8_t*) ((uintptr_t) a6 + a_offset);
-      }
-      const int8_t* restrict a7 = a[7];
-      if XNN_UNPREDICTABLE(a7 != zero) {
-        a7 = (const int8_t*) ((uintptr_t) a7 + a_offset);
-      }
-      const int8_t* restrict a8 = a[8];
-      if XNN_UNPREDICTABLE(a8 != zero) {
-        a8 = (const int8_t*) ((uintptr_t) a8 + a_offset);
-      }
-      const int8_t* restrict a9 = a[9];
-      if XNN_UNPREDICTABLE(a9 != zero) {
-        a9 = (const int8_t*) ((uintptr_t) a9 + a_offset);
-      }
-      const int8_t* restrict a10 = a[10];
-      if XNN_UNPREDICTABLE(a10 != zero) {
-        a10 = (const int8_t*) ((uintptr_t) a10 + a_offset);
-      }
-      const int8_t* restrict a11 = a[11];
-      if XNN_UNPREDICTABLE(a11 != zero) {
-        a11 = (const int8_t*) ((uintptr_t) a11 + a_offset);
-      }
-      a += 12;
-
-      // Inner accumulation loop along the 8 columns.
-      size_t k = kc;
-      // 2x partial unrolled loop to load 8 bytes at a time.
-      while (k >= 8 * sizeof(int8_t)) {
-        // Load a 12x8 block of activations.
-        const int8x8_t va0x01234567 = vld1_s8(a0); a0 += 8;
-        const int8x8_t va1x01234567 = vld1_s8(a1); a1 += 8;
-        const int8x8_t va2x01234567 = vld1_s8(a2); a2 += 8;
-        const int8x8_t va3x01234567 = vld1_s8(a3); a3 += 8;
-        const int8x8_t va4x01234567 = vld1_s8(a4); a4 += 8;
-        const int8x8_t va5x01234567 = vld1_s8(a5); a5 += 8;
-        const int8x8_t va6x01234567 = vld1_s8(a6); a6 += 8;
-        const int8x8_t va7x01234567 = vld1_s8(a7); a7 += 8;
-        const int8x8_t va8x01234567 = vld1_s8(a8); a8 += 8;
-        const int8x8_t va9x01234567 = vld1_s8(a9); a9 += 8;
-        const int8x8_t va10x01234567 = vld1_s8(a10); a10 += 8;
-        const int8x8_t va11x01234567 = vld1_s8(a11); a11 += 8;
-
-        // Load a 8x8 block of weights.
-        const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-
-        // Multiply-accumulate: 12x8 * 8x8 --> 12x8.
-        vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb0123x0123, va0x01234567, 0);
-        vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb0123x4567, va0x01234567, 0);
-        vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb0123x0123, va1x01234567, 0);
-        vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb0123x4567, va1x01234567, 0);
-        vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb0123x0123, va2x01234567, 0);
-        vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb0123x4567, va2x01234567, 0);
-        vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb0123x0123, va3x01234567, 0);
-        vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb0123x4567, va3x01234567, 0);
-        vacc4x0123 = vdotq_lane_s32(vacc4x0123, vb0123x0123, va4x01234567, 0);
-        vacc4x4567 = vdotq_lane_s32(vacc4x4567, vb0123x4567, va4x01234567, 0);
-        vacc5x0123 = vdotq_lane_s32(vacc5x0123, vb0123x0123, va5x01234567, 0);
-        vacc5x4567 = vdotq_lane_s32(vacc5x4567, vb0123x4567, va5x01234567, 0);
-        vacc6x0123 = vdotq_lane_s32(vacc6x0123, vb0123x0123, va6x01234567, 0);
-        vacc6x4567 = vdotq_lane_s32(vacc6x4567, vb0123x4567, va6x01234567, 0);
-        vacc7x0123 = vdotq_lane_s32(vacc7x0123, vb0123x0123, va7x01234567, 0);
-        vacc7x4567 = vdotq_lane_s32(vacc7x4567, vb0123x4567, va7x01234567, 0);
-        vacc8x0123 = vdotq_lane_s32(vacc8x0123, vb0123x0123, va8x01234567, 0);
-        vacc8x4567 = vdotq_lane_s32(vacc8x4567, vb0123x4567, va8x01234567, 0);
-        vacc9x0123 = vdotq_lane_s32(vacc9x0123, vb0123x0123, va9x01234567, 0);
-        vacc9x4567 = vdotq_lane_s32(vacc9x4567, vb0123x4567, va9x01234567, 0);
-        vacc10x0123 = vdotq_lane_s32(vacc10x0123, vb0123x0123, va10x01234567, 0);
-        vacc10x4567 = vdotq_lane_s32(vacc10x4567, vb0123x4567, va10x01234567, 0);
-        vacc11x0123 = vdotq_lane_s32(vacc11x0123, vb0123x0123, va11x01234567, 0);
-        vacc11x4567 = vdotq_lane_s32(vacc11x4567, vb0123x4567, va11x01234567, 0);
-        vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
-        vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
-        vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb4567x0123, va1x01234567, 1);
-        vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb4567x4567, va1x01234567, 1);
-        vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb4567x0123, va2x01234567, 1);
-        vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb4567x4567, va2x01234567, 1);
-        vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb4567x0123, va3x01234567, 1);
-        vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb4567x4567, va3x01234567, 1);
-        vacc4x0123 = vdotq_lane_s32(vacc4x0123, vb4567x0123, va4x01234567, 1);
-        vacc4x4567 = vdotq_lane_s32(vacc4x4567, vb4567x4567, va4x01234567, 1);
-        vacc5x0123 = vdotq_lane_s32(vacc5x0123, vb4567x0123, va5x01234567, 1);
-        vacc5x4567 = vdotq_lane_s32(vacc5x4567, vb4567x4567, va5x01234567, 1);
-        vacc6x0123 = vdotq_lane_s32(vacc6x0123, vb4567x0123, va6x01234567, 1);
-        vacc6x4567 = vdotq_lane_s32(vacc6x4567, vb4567x4567, va6x01234567, 1);
-        vacc7x0123 = vdotq_lane_s32(vacc7x0123, vb4567x0123, va7x01234567, 1);
-        vacc7x4567 = vdotq_lane_s32(vacc7x4567, vb4567x4567, va7x01234567, 1);
-        vacc8x0123 = vdotq_lane_s32(vacc8x0123, vb4567x0123, va8x01234567, 1);
-        vacc8x4567 = vdotq_lane_s32(vacc8x4567, vb4567x4567, va8x01234567, 1);
-        vacc9x0123 = vdotq_lane_s32(vacc9x0123, vb4567x0123, va9x01234567, 1);
-        vacc9x4567 = vdotq_lane_s32(vacc9x4567, vb4567x4567, va9x01234567, 1);
-        vacc10x0123 = vdotq_lane_s32(vacc10x0123, vb4567x0123, va10x01234567, 1);
-        vacc10x4567 = vdotq_lane_s32(vacc10x4567, vb4567x4567, va10x01234567, 1);
-        vacc11x0123 = vdotq_lane_s32(vacc11x0123, vb4567x0123, va11x01234567, 1);
-        vacc11x4567 = vdotq_lane_s32(vacc11x4567, vb4567x4567, va11x01234567, 1);
-
-        k -= 8 * sizeof(int8_t);
-      }
-      // Handle up to 7 final positions of `k`
-      if XNN_UNLIKELY(k != 0) {
-        // Load a 12x4 block of activations.
-        const int8x8_t va0x01234567 = vld1_s8(a0);
-        const int8x8_t va1x01234567 = vld1_s8(a1);
-        const int8x8_t va2x01234567 = vld1_s8(a2);
-        const int8x8_t va3x01234567 = vld1_s8(a3);
-        const int8x8_t va4x01234567 = vld1_s8(a4);
-        const int8x8_t va5x01234567 = vld1_s8(a5);
-        const int8x8_t va6x01234567 = vld1_s8(a6);
-        const int8x8_t va7x01234567 = vld1_s8(a7);
-        const int8x8_t va8x01234567 = vld1_s8(a8);
-        const int8x8_t va9x01234567 = vld1_s8(a9);
-        const int8x8_t va10x01234567 = vld1_s8(a10);
-        const int8x8_t va11x01234567 = vld1_s8(a11);
-
-        // Load a 4x8 block of weights.
-        const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-
-        // Multiply-accumulate: 12x4 * 4x8 --> 12x8.
-        vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb0123x0123, va0x01234567, 0);
-        vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb0123x4567, va0x01234567, 0);
-        vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb0123x0123, va1x01234567, 0);
-        vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb0123x4567, va1x01234567, 0);
-        vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb0123x0123, va2x01234567, 0);
-        vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb0123x4567, va2x01234567, 0);
-        vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb0123x0123, va3x01234567, 0);
-        vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb0123x4567, va3x01234567, 0);
-        vacc4x0123 = vdotq_lane_s32(vacc4x0123, vb0123x0123, va4x01234567, 0);
-        vacc4x4567 = vdotq_lane_s32(vacc4x4567, vb0123x4567, va4x01234567, 0);
-        vacc5x0123 = vdotq_lane_s32(vacc5x0123, vb0123x0123, va5x01234567, 0);
-        vacc5x4567 = vdotq_lane_s32(vacc5x4567, vb0123x4567, va5x01234567, 0);
-        vacc6x0123 = vdotq_lane_s32(vacc6x0123, vb0123x0123, va6x01234567, 0);
-        vacc6x4567 = vdotq_lane_s32(vacc6x4567, vb0123x4567, va6x01234567, 0);
-        vacc7x0123 = vdotq_lane_s32(vacc7x0123, vb0123x0123, va7x01234567, 0);
-        vacc7x4567 = vdotq_lane_s32(vacc7x4567, vb0123x4567, va7x01234567, 0);
-        vacc8x0123 = vdotq_lane_s32(vacc8x0123, vb0123x0123, va8x01234567, 0);
-        vacc8x4567 = vdotq_lane_s32(vacc8x4567, vb0123x4567, va8x01234567, 0);
-        vacc9x0123 = vdotq_lane_s32(vacc9x0123, vb0123x0123, va9x01234567, 0);
-        vacc9x4567 = vdotq_lane_s32(vacc9x4567, vb0123x4567, va9x01234567, 0);
-        vacc10x0123 = vdotq_lane_s32(vacc10x0123, vb0123x0123, va10x01234567, 0);
-        vacc10x4567 = vdotq_lane_s32(vacc10x4567, vb0123x4567, va10x01234567, 0);
-        vacc11x0123 = vdotq_lane_s32(vacc11x0123, vb0123x0123, va11x01234567, 0);
-        vacc11x4567 = vdotq_lane_s32(vacc11x4567, vb0123x4567, va11x01234567, 0);
-
-        if (k > 4) {
-          // Load a 4x8 block of weights.
-          const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-          const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-
-          // Multiply-accumulate: 12x4 * 4x8 --> 12x8.
-          vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
-          vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
-          vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb4567x0123, va1x01234567, 1);
-          vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb4567x4567, va1x01234567, 1);
-          vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb4567x0123, va2x01234567, 1);
-          vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb4567x4567, va2x01234567, 1);
-          vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb4567x0123, va3x01234567, 1);
-          vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb4567x4567, va3x01234567, 1);
-          vacc4x0123 = vdotq_lane_s32(vacc4x0123, vb4567x0123, va4x01234567, 1);
-          vacc4x4567 = vdotq_lane_s32(vacc4x4567, vb4567x4567, va4x01234567, 1);
-          vacc5x0123 = vdotq_lane_s32(vacc5x0123, vb4567x0123, va5x01234567, 1);
-          vacc5x4567 = vdotq_lane_s32(vacc5x4567, vb4567x4567, va5x01234567, 1);
-          vacc6x0123 = vdotq_lane_s32(vacc6x0123, vb4567x0123, va6x01234567, 1);
-          vacc6x4567 = vdotq_lane_s32(vacc6x4567, vb4567x4567, va6x01234567, 1);
-          vacc7x0123 = vdotq_lane_s32(vacc7x0123, vb4567x0123, va7x01234567, 1);
-          vacc7x4567 = vdotq_lane_s32(vacc7x4567, vb4567x4567, va7x01234567, 1);
-          vacc8x0123 = vdotq_lane_s32(vacc8x0123, vb4567x0123, va8x01234567, 1);
-          vacc8x4567 = vdotq_lane_s32(vacc8x4567, vb4567x4567, va8x01234567, 1);
-          vacc9x0123 = vdotq_lane_s32(vacc9x0123, vb4567x0123, va9x01234567, 1);
-          vacc9x4567 = vdotq_lane_s32(vacc9x4567, vb4567x4567, va9x01234567, 1);
-          vacc10x0123 = vdotq_lane_s32(vacc10x0123, vb4567x0123, va10x01234567, 1);
-          vacc10x4567 = vdotq_lane_s32(vacc10x4567, vb4567x4567, va10x01234567, 1);
-          vacc11x0123 = vdotq_lane_s32(vacc11x0123, vb4567x0123, va11x01234567, 1);
-          vacc11x4567 = vdotq_lane_s32(vacc11x4567, vb4567x4567, va11x01234567, 1);
-        }
-      }
-      p -= 12 * sizeof(void*);
-    } while (p != 0);
-
-    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
-    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
-    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
-    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
-    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
-    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
-    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
-    vacc3x0123 = vqrdmulhq_s32(vacc3x0123, vmultiplier);
-    vacc3x4567 = vqrdmulhq_s32(vacc3x4567, vmultiplier);
-    vacc4x0123 = vqrdmulhq_s32(vacc4x0123, vmultiplier);
-    vacc4x4567 = vqrdmulhq_s32(vacc4x4567, vmultiplier);
-    vacc5x0123 = vqrdmulhq_s32(vacc5x0123, vmultiplier);
-    vacc5x4567 = vqrdmulhq_s32(vacc5x4567, vmultiplier);
-    vacc6x0123 = vqrdmulhq_s32(vacc6x0123, vmultiplier);
-    vacc6x4567 = vqrdmulhq_s32(vacc6x4567, vmultiplier);
-    vacc7x0123 = vqrdmulhq_s32(vacc7x0123, vmultiplier);
-    vacc7x4567 = vqrdmulhq_s32(vacc7x4567, vmultiplier);
-    vacc8x0123 = vqrdmulhq_s32(vacc8x0123, vmultiplier);
-    vacc8x4567 = vqrdmulhq_s32(vacc8x4567, vmultiplier);
-    vacc9x0123 = vqrdmulhq_s32(vacc9x0123, vmultiplier);
-    vacc9x4567 = vqrdmulhq_s32(vacc9x4567, vmultiplier);
-    vacc10x0123 = vqrdmulhq_s32(vacc10x0123, vmultiplier);
-    vacc10x4567 = vqrdmulhq_s32(vacc10x4567, vmultiplier);
-    vacc11x0123 = vqrdmulhq_s32(vacc11x0123, vmultiplier);
-    vacc11x4567 = vqrdmulhq_s32(vacc11x4567, vmultiplier);
-
-    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
-    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
-    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
-    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
-    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
-    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
-    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
-    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
-    vacc3x0123 = vsraq_n_s32(vacc3x0123, vbicq_s32(vacc3x0123, vzero_shift_mask), 31);
-    vacc3x4567 = vsraq_n_s32(vacc3x4567, vbicq_s32(vacc3x4567, vzero_shift_mask), 31);
-    vacc4x0123 = vsraq_n_s32(vacc4x0123, vbicq_s32(vacc4x0123, vzero_shift_mask), 31);
-    vacc4x4567 = vsraq_n_s32(vacc4x4567, vbicq_s32(vacc4x4567, vzero_shift_mask), 31);
-    vacc5x0123 = vsraq_n_s32(vacc5x0123, vbicq_s32(vacc5x0123, vzero_shift_mask), 31);
-    vacc5x4567 = vsraq_n_s32(vacc5x4567, vbicq_s32(vacc5x4567, vzero_shift_mask), 31);
-    vacc6x0123 = vsraq_n_s32(vacc6x0123, vbicq_s32(vacc6x0123, vzero_shift_mask), 31);
-    vacc6x4567 = vsraq_n_s32(vacc6x4567, vbicq_s32(vacc6x4567, vzero_shift_mask), 31);
-    vacc7x0123 = vsraq_n_s32(vacc7x0123, vbicq_s32(vacc7x0123, vzero_shift_mask), 31);
-    vacc7x4567 = vsraq_n_s32(vacc7x4567, vbicq_s32(vacc7x4567, vzero_shift_mask), 31);
-    vacc8x0123 = vsraq_n_s32(vacc8x0123, vbicq_s32(vacc8x0123, vzero_shift_mask), 31);
-    vacc8x4567 = vsraq_n_s32(vacc8x4567, vbicq_s32(vacc8x4567, vzero_shift_mask), 31);
-    vacc9x0123 = vsraq_n_s32(vacc9x0123, vbicq_s32(vacc9x0123, vzero_shift_mask), 31);
-    vacc9x4567 = vsraq_n_s32(vacc9x4567, vbicq_s32(vacc9x4567, vzero_shift_mask), 31);
-    vacc10x0123 = vsraq_n_s32(vacc10x0123, vbicq_s32(vacc10x0123, vzero_shift_mask), 31);
-    vacc10x4567 = vsraq_n_s32(vacc10x4567, vbicq_s32(vacc10x4567, vzero_shift_mask), 31);
-    vacc11x0123 = vsraq_n_s32(vacc11x0123, vbicq_s32(vacc11x0123, vzero_shift_mask), 31);
-    vacc11x4567 = vsraq_n_s32(vacc11x4567, vbicq_s32(vacc11x4567, vzero_shift_mask), 31);
-
-    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
-    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
-    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
-    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
-    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
-    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
-    vacc3x0123 = vrshlq_s32(vacc3x0123, vright_shift);
-    vacc3x4567 = vrshlq_s32(vacc3x4567, vright_shift);
-    vacc4x0123 = vrshlq_s32(vacc4x0123, vright_shift);
-    vacc4x4567 = vrshlq_s32(vacc4x4567, vright_shift);
-    vacc5x0123 = vrshlq_s32(vacc5x0123, vright_shift);
-    vacc5x4567 = vrshlq_s32(vacc5x4567, vright_shift);
-    vacc6x0123 = vrshlq_s32(vacc6x0123, vright_shift);
-    vacc6x4567 = vrshlq_s32(vacc6x4567, vright_shift);
-    vacc7x0123 = vrshlq_s32(vacc7x0123, vright_shift);
-    vacc7x4567 = vrshlq_s32(vacc7x4567, vright_shift);
-    vacc8x0123 = vrshlq_s32(vacc8x0123, vright_shift);
-    vacc8x4567 = vrshlq_s32(vacc8x4567, vright_shift);
-    vacc9x0123 = vrshlq_s32(vacc9x0123, vright_shift);
-    vacc9x4567 = vrshlq_s32(vacc9x4567, vright_shift);
-    vacc10x0123 = vrshlq_s32(vacc10x0123, vright_shift);
-    vacc10x4567 = vrshlq_s32(vacc10x4567, vright_shift);
-    vacc11x0123 = vrshlq_s32(vacc11x0123, vright_shift);
-    vacc11x4567 = vrshlq_s32(vacc11x4567, vright_shift);
-
-    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
-#if XNN_ARCH_ARM64
-    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
-    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
-    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
-    const int16x8_t vacc3x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567), voutput_zero_point);
-    const int16x8_t vacc4x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc4x0123), vacc4x4567), voutput_zero_point);
-    const int16x8_t vacc5x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc5x0123), vacc5x4567), voutput_zero_point);
-    const int16x8_t vacc6x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc6x0123), vacc6x4567), voutput_zero_point);
-    const int16x8_t vacc7x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc7x0123), vacc7x4567), voutput_zero_point);
-    const int16x8_t vacc8x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc8x0123), vacc8x4567), voutput_zero_point);
-    const int16x8_t vacc9x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc9x0123), vacc9x4567), voutput_zero_point);
-    const int16x8_t vacc10x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc10x0123), vacc10x4567), voutput_zero_point);
-    const int16x8_t vacc11x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc11x0123), vacc11x4567), voutput_zero_point);
-
-    int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
-    int8x16_t vout2x01234567_3x01234567 = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc3x01234567);
-    int8x16_t vout4x01234567_5x01234567 = vqmovn_high_s16(vqmovn_s16(vacc4x01234567), vacc5x01234567);
-    int8x16_t vout6x01234567_7x01234567 = vqmovn_high_s16(vqmovn_s16(vacc6x01234567), vacc7x01234567);
-    int8x16_t vout8x01234567_9x01234567 = vqmovn_high_s16(vqmovn_s16(vacc8x01234567), vacc9x01234567);
-    int8x16_t vout10x01234567_11x01234567 = vqmovn_high_s16(vqmovn_s16(vacc10x01234567), vacc11x01234567);
-#else
-    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
-    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
-    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
-    const int16x8_t vacc3x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x0123), vqmovn_s32(vacc3x4567)), voutput_zero_point);
-    const int16x8_t vacc4x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc4x0123), vqmovn_s32(vacc4x4567)), voutput_zero_point);
-    const int16x8_t vacc5x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc5x0123), vqmovn_s32(vacc5x4567)), voutput_zero_point);
-    const int16x8_t vacc6x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc6x0123), vqmovn_s32(vacc6x4567)), voutput_zero_point);
-    const int16x8_t vacc7x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc7x0123), vqmovn_s32(vacc7x4567)), voutput_zero_point);
-    const int16x8_t vacc8x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc8x0123), vqmovn_s32(vacc8x4567)), voutput_zero_point);
-    const int16x8_t vacc9x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc9x0123), vqmovn_s32(vacc9x4567)), voutput_zero_point);
-    const int16x8_t vacc10x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc10x0123), vqmovn_s32(vacc10x4567)), voutput_zero_point);
-    const int16x8_t vacc11x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc11x0123), vqmovn_s32(vacc11x4567)), voutput_zero_point);
-
-    int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
-    int8x16_t vout2x01234567_3x01234567 = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc3x01234567));
-    int8x16_t vout4x01234567_5x01234567 = vcombine_s8(vqmovn_s16(vacc4x01234567), vqmovn_s16(vacc5x01234567));
-    int8x16_t vout6x01234567_7x01234567 = vcombine_s8(vqmovn_s16(vacc6x01234567), vqmovn_s16(vacc7x01234567));
-    int8x16_t vout8x01234567_9x01234567 = vcombine_s8(vqmovn_s16(vacc8x01234567), vqmovn_s16(vacc9x01234567));
-    int8x16_t vout10x01234567_11x01234567 = vcombine_s8(vqmovn_s16(vacc10x01234567), vqmovn_s16(vacc11x01234567));
-#endif
-    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
-    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
-
-    vout10x01234567_11x01234567 = vmaxq_s8(vout10x01234567_11x01234567, voutput_min);
-    vout8x01234567_9x01234567 = vmaxq_s8(vout8x01234567_9x01234567, voutput_min);
-    vout6x01234567_7x01234567 = vmaxq_s8(vout6x01234567_7x01234567, voutput_min);
-    vout4x01234567_5x01234567 = vmaxq_s8(vout4x01234567_5x01234567, voutput_min);
-    vout2x01234567_3x01234567 = vmaxq_s8(vout2x01234567_3x01234567, voutput_min);
-    vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min);
-
-    vout10x01234567_11x01234567 = vminq_s8(vout10x01234567_11x01234567, voutput_max);
-    vout8x01234567_9x01234567 = vminq_s8(vout8x01234567_9x01234567, voutput_max);
-    vout6x01234567_7x01234567 = vminq_s8(vout6x01234567_7x01234567, voutput_max);
-    vout4x01234567_5x01234567 = vminq_s8(vout4x01234567_5x01234567, voutput_max);
-    vout2x01234567_3x01234567 = vminq_s8(vout2x01234567_3x01234567, voutput_max);
-    vout0x01234567_1x01234567 = vminq_s8(vout0x01234567_1x01234567, voutput_max);
-
-    if (nc >= 8) {
-      vst1_s8(c11 + 0, vget_high_s8(vout10x01234567_11x01234567));
-      vst1_s8(c10 + 0, vget_low_s8(vout10x01234567_11x01234567));
-      vst1_s8(c9 + 0, vget_high_s8(vout8x01234567_9x01234567));
-      vst1_s8(c8 + 0, vget_low_s8(vout8x01234567_9x01234567));
-      vst1_s8(c7 + 0, vget_high_s8(vout6x01234567_7x01234567));
-      vst1_s8(c6 + 0, vget_low_s8(vout6x01234567_7x01234567));
-      vst1_s8(c5 + 0, vget_high_s8(vout4x01234567_5x01234567));
-      vst1_s8(c4 + 0, vget_low_s8(vout4x01234567_5x01234567));
-      vst1_s8(c3 + 0, vget_high_s8(vout2x01234567_3x01234567));
-      vst1_s8(c2 + 0, vget_low_s8(vout2x01234567_3x01234567));
-      vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567));
-      vst1_s8(c0 + 0, vget_low_s8(vout0x01234567_1x01234567));
-
-      c11 = (int8_t*) ((uintptr_t) c11 + cn_stride);
-      c10 = (int8_t*) ((uintptr_t) c10 + cn_stride);
-      c9 = (int8_t*) ((uintptr_t) c9 + cn_stride);
-      c8 = (int8_t*) ((uintptr_t) c8 + cn_stride);
-      c7 = (int8_t*) ((uintptr_t) c7 + cn_stride);
-      c6 = (int8_t*) ((uintptr_t) c6 + cn_stride);
-      c5 = (int8_t*) ((uintptr_t) c5 + cn_stride);
-      c4 = (int8_t*) ((uintptr_t) c4 + cn_stride);
-      c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
-      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
-      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
-      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
-
-      a = (const int8_t**restrict) ((uintptr_t) a - ks);
-
-      nc -= 8;
-    } else {
-      if (nc & 4) {
-        vst1q_lane_u32(__builtin_assume_aligned(c11, 1), vreinterpretq_u32_s8(vout10x01234567_11x01234567), 2); c11 += 4;
-        vst1q_lane_u32(__builtin_assume_aligned(c10, 1), vreinterpretq_u32_s8(vout10x01234567_11x01234567), 0); c10 += 4;
-        vst1q_lane_u32(__builtin_assume_aligned(c9, 1), vreinterpretq_u32_s8(vout8x01234567_9x01234567), 2); c9 += 4;
-        vst1q_lane_u32(__builtin_assume_aligned(c8, 1), vreinterpretq_u32_s8(vout8x01234567_9x01234567), 0); c8 += 4;
-        vst1q_lane_u32(__builtin_assume_aligned(c7, 1), vreinterpretq_u32_s8(vout6x01234567_7x01234567), 2); c7 += 4;
-        vst1q_lane_u32(__builtin_assume_aligned(c6, 1), vreinterpretq_u32_s8(vout6x01234567_7x01234567), 0); c6 += 4;
-        vst1q_lane_u32(__builtin_assume_aligned(c5, 1), vreinterpretq_u32_s8(vout4x01234567_5x01234567), 2); c5 += 4;
-        vst1q_lane_u32(__builtin_assume_aligned(c4, 1), vreinterpretq_u32_s8(vout4x01234567_5x01234567), 0); c4 += 4;
-        vst1q_lane_u32(__builtin_assume_aligned(c3, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 2); c3 += 4;
-        vst1q_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 0); c2 += 4;
-        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
-        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
-        vout10x01234567_11x01234567 = vextq_s8(vout10x01234567_11x01234567, vout10x01234567_11x01234567, 4);
-        vout8x01234567_9x01234567 = vextq_s8(vout8x01234567_9x01234567, vout8x01234567_9x01234567, 4);
-        vout6x01234567_7x01234567 = vextq_s8(vout6x01234567_7x01234567, vout6x01234567_7x01234567, 4);
-        vout4x01234567_5x01234567 = vextq_s8(vout4x01234567_5x01234567, vout4x01234567_5x01234567, 4);
-        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4);
-        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
-      }
-      if (nc & 2) {
-        vst1q_lane_u16(__builtin_assume_aligned(c11, 1), vreinterpretq_u16_s8(vout10x01234567_11x01234567), 4); c11 += 2;
-        vst1q_lane_u16(__builtin_assume_aligned(c10, 1), vreinterpretq_u16_s8(vout10x01234567_11x01234567), 0); c10 += 2;
-        vst1q_lane_u16(__builtin_assume_aligned(c9, 1), vreinterpretq_u16_s8(vout8x01234567_9x01234567), 4); c9 += 2;
-        vst1q_lane_u16(__builtin_assume_aligned(c8, 1), vreinterpretq_u16_s8(vout8x01234567_9x01234567), 0); c8 += 2;
-        vst1q_lane_u16(__builtin_assume_aligned(c7, 1), vreinterpretq_u16_s8(vout6x01234567_7x01234567), 4); c7 += 2;
-        vst1q_lane_u16(__builtin_assume_aligned(c6, 1), vreinterpretq_u16_s8(vout6x01234567_7x01234567), 0); c6 += 2;
-        vst1q_lane_u16(__builtin_assume_aligned(c5, 1), vreinterpretq_u16_s8(vout4x01234567_5x01234567), 4); c5 += 2;
-        vst1q_lane_u16(__builtin_assume_aligned(c4, 1), vreinterpretq_u16_s8(vout4x01234567_5x01234567), 0); c4 += 2;
-        vst1q_lane_u16(__builtin_assume_aligned(c3, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 4); c3 += 2;
-        vst1q_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 0); c2 += 2;
-        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
-        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
-        vout10x01234567_11x01234567 = vextq_s8(vout10x01234567_11x01234567, vout10x01234567_11x01234567, 2);
-        vout8x01234567_9x01234567 = vextq_s8(vout8x01234567_9x01234567, vout8x01234567_9x01234567, 2);
-        vout6x01234567_7x01234567 = vextq_s8(vout6x01234567_7x01234567, vout6x01234567_7x01234567, 2);
-        vout4x01234567_5x01234567 = vextq_s8(vout4x01234567_5x01234567, vout4x01234567_5x01234567, 2);
-        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2);
-        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
-      }
-      if (nc & 1) {
-        vst1q_lane_s8(c11, vout10x01234567_11x01234567, 8);
-        vst1q_lane_s8(c10, vout10x01234567_11x01234567, 0);
-        vst1q_lane_s8(c9, vout8x01234567_9x01234567, 8);
-        vst1q_lane_s8(c8, vout8x01234567_9x01234567, 0);
-        vst1q_lane_s8(c7, vout6x01234567_7x01234567, 8);
-        vst1q_lane_s8(c6, vout6x01234567_7x01234567, 0);
-        vst1q_lane_s8(c5, vout4x01234567_5x01234567, 8);
-        vst1q_lane_s8(c4, vout4x01234567_5x01234567, 0);
-        vst1q_lane_s8(c3, vout2x01234567_3x01234567, 8);
-        vst1q_lane_s8(c2, vout2x01234567_3x01234567, 0);
-        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
-        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
-      }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
diff --git a/src/qs8-igemm/gen/1x16-minmax-neon-mull-addw-dup.c b/src/qs8-igemm/gen/1x16-minmax-neon-mull-addw-dup.c
new file mode 100644
index 0000000..80eb6bf
--- /dev/null
+++ b/src/qs8-igemm/gen/1x16-minmax-neon-mull-addw-dup.c
@@ -0,0 +1,300 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/neon-mull-addw-dup.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/gemm.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (1 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  int8_t* c0 = c;
+
+  do {
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      a += 1;
+
+      size_t k = kc;
+      while (k >= 8 * sizeof(int8_t)) {
+        const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+
+        const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va0, 0));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c0));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c0));
+        const int8x8_t vb89ABCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va0, 0));
+        vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc0));
+        vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc0));
+        const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va0, 1));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c1));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c1));
+        const int8x8_t vb89ABCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va0, 1));
+        vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc1));
+        vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc1));
+        const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va0, 2));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c2));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c2));
+        const int8x8_t vb89ABCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va0, 2));
+        vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc2));
+        vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc2));
+        const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));
+        const int8x8_t vb89ABCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va0, 3));
+        vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc3));
+        vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc3));
+        const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va0, 4));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c4));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c4));
+        const int8x8_t vb89ABCDEFc4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va0, 4));
+        vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc4));
+        vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc4));
+        const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va0, 5));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c5));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c5));
+        const int8x8_t vb89ABCDEFc5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va0, 5));
+        vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc5));
+        vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc5));
+        const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va0, 6));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c6));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c6));
+        const int8x8_t vb89ABCDEFc6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va0, 6));
+        vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc6));
+        vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc6));
+        const int8x8_t vb01234567c7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c7 = vmull_s8(vb01234567c7, vdup_lane_s8(va0, 7));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c7));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c7));
+        const int8x8_t vb89ABCDEFc7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x89ABCDEFc7 = vmull_s8(vb89ABCDEFc7, vdup_lane_s8(va0, 7));
+        vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc7));
+        vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc7));
+
+        k -= 8 * sizeof(int8_t);
+      }
+      if XNN_UNLIKELY(k != 0) {
+        const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+
+        const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va0, 0));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c0));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c0));
+        const int16x8_t vprod0x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va0, 0));
+        vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc0));
+        vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc0));
+
+        if (k >= 2 * sizeof(int8_t)) {
+          const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb89ABCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          const int16x8_t vprod0x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va0, 1));
+          vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c1));
+          vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c1));
+          const int16x8_t vprod0x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va0, 1));
+          vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc1));
+          vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc1));
+
+          if (k > 2 * sizeof(int8_t)) {
+            const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int8x8_t vb89ABCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+            const int16x8_t vprod0x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va0, 2));
+            vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c2));
+            vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c2));
+            const int16x8_t vprod0x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va0, 2));
+            vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc2));
+            vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc2));
+
+            if (k >= 4 * sizeof(int8_t)) {
+              const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+              const int8x8_t vb89ABCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+              const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));
+              vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
+              vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));
+              const int16x8_t vprod0x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va0, 3));
+              vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc3));
+              vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc3));
+
+              if (k > 4 * sizeof(int8_t)) {
+                const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+                const int8x8_t vb89ABCDEFc4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+                const int16x8_t vprod0x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va0, 4));
+                vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c4));
+                vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c4));
+                const int16x8_t vprod0x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va0, 4));
+                vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc4));
+                vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc4));
+
+                if (k >= 6 * sizeof(int8_t)) {
+                  const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+                  const int8x8_t vb89ABCDEFc5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+                  const int16x8_t vprod0x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va0, 5));
+                  vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c5));
+                  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c5));
+                  const int16x8_t vprod0x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va0, 5));
+                  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc5));
+                  vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc5));
+
+                  if (k > 6 * sizeof(int8_t)) {
+                    const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+                    const int8x8_t vb89ABCDEFc6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+                    const int16x8_t vprod0x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va0, 6));
+                    vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c6));
+                    vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c6));
+                    const int16x8_t vprod0x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va0, 6));
+                    vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc6));
+                    vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc6));
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+      p -= 1 * sizeof(void*);
+    } while (p != 0);
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+    vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+    vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+
+    vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+
+    if (nc >= 16) {
+      vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 16;
+    } else {
+      int8x8_t vout0x01234567 = vget_low_s8(vout0x0123456789ABCDEF);
+      if (nc & 8) {
+        vst1_s8(c0, vout0x01234567); c0 += 8;
+        vout0x01234567 = vget_high_s8(vout0x0123456789ABCDEF);
+      }
+      if (nc & 4) {
+        vst1_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpret_u32_s8(vout0x01234567), 0); c0 += 4;
+        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpret_u16_s8(vout0x01234567), 0); c0 += 2;
+        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1_lane_s8(c0, vout0x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/1x16c16-minmax-neon-mlal-padal.c b/src/qs8-igemm/gen/1x16c16-minmax-neon-mlal-padal.c
new file mode 100644
index 0000000..302733a
--- /dev/null
+++ b/src/qs8-igemm/gen/1x16c16-minmax-neon-mlal-padal.c
@@ -0,0 +1,258 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/c16-neon-mlal-padal.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (1 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 16);
+  int8_t* c0 = c;
+
+  do {
+    int32x4_t vacc0x0 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x1 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x2 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x3 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x4 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x5 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x6 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x7 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x8 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x9 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x10 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x11 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x12 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x13 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x14 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x15 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      a += 1;
+
+      // KC loop: process 16 bytes of K per iteration (kc is rounded up to a multiple of 16, so up to 15 remainder bytes are covered by the final iteration)
+      size_t k = 0;
+      while (k < kc) {
+        const int8x16_t va0 = vld1q_s8(a0); a0 += 16;
+
+        const int8x16_t vb0 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb1 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb2 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb3 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb4 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb5 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb6 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb7 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb8 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb9 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb10 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb11 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb12 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb13 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb14 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb15 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+
+        int16x8_t vprod0x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va0));
+        vprod0x0 = vmlal_s8(vprod0x0, vget_high_s8(vb0), vget_high_s8(va0));
+        vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+        int16x8_t vprod0x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va0));
+        vprod0x1 = vmlal_s8(vprod0x1, vget_high_s8(vb1), vget_high_s8(va0));
+        vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+        int16x8_t vprod0x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va0));
+        vprod0x2 = vmlal_s8(vprod0x2, vget_high_s8(vb2), vget_high_s8(va0));
+        vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+        int16x8_t vprod0x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va0));
+        vprod0x3 = vmlal_s8(vprod0x3, vget_high_s8(vb3), vget_high_s8(va0));
+        vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+        int16x8_t vprod0x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va0));
+        vprod0x4 = vmlal_s8(vprod0x4, vget_high_s8(vb4), vget_high_s8(va0));
+        vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+        int16x8_t vprod0x5 = vmull_s8(vget_low_s8(vb5), vget_low_s8(va0));
+        vprod0x5 = vmlal_s8(vprod0x5, vget_high_s8(vb5), vget_high_s8(va0));
+        vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+        int16x8_t vprod0x6 = vmull_s8(vget_low_s8(vb6), vget_low_s8(va0));
+        vprod0x6 = vmlal_s8(vprod0x6, vget_high_s8(vb6), vget_high_s8(va0));
+        vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+        int16x8_t vprod0x7 = vmull_s8(vget_low_s8(vb7), vget_low_s8(va0));
+        vprod0x7 = vmlal_s8(vprod0x7, vget_high_s8(vb7), vget_high_s8(va0));
+        vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+        int16x8_t vprod0x8 = vmull_s8(vget_low_s8(vb8), vget_low_s8(va0));
+        vprod0x8 = vmlal_s8(vprod0x8, vget_high_s8(vb8), vget_high_s8(va0));
+        vacc0x8 = vpadalq_s16(vacc0x8, vprod0x8);
+        int16x8_t vprod0x9 = vmull_s8(vget_low_s8(vb9), vget_low_s8(va0));
+        vprod0x9 = vmlal_s8(vprod0x9, vget_high_s8(vb9), vget_high_s8(va0));
+        vacc0x9 = vpadalq_s16(vacc0x9, vprod0x9);
+        int16x8_t vprod0x10 = vmull_s8(vget_low_s8(vb10), vget_low_s8(va0));
+        vprod0x10 = vmlal_s8(vprod0x10, vget_high_s8(vb10), vget_high_s8(va0));
+        vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
+        int16x8_t vprod0x11 = vmull_s8(vget_low_s8(vb11), vget_low_s8(va0));
+        vprod0x11 = vmlal_s8(vprod0x11, vget_high_s8(vb11), vget_high_s8(va0));
+        vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11);
+        int16x8_t vprod0x12 = vmull_s8(vget_low_s8(vb12), vget_low_s8(va0));
+        vprod0x12 = vmlal_s8(vprod0x12, vget_high_s8(vb12), vget_high_s8(va0));
+        vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);
+        int16x8_t vprod0x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va0));
+        vprod0x13 = vmlal_s8(vprod0x13, vget_high_s8(vb13), vget_high_s8(va0));
+        vacc0x13 = vpadalq_s16(vacc0x13, vprod0x13);
+        int16x8_t vprod0x14 = vmull_s8(vget_low_s8(vb14), vget_low_s8(va0));
+        vprod0x14 = vmlal_s8(vprod0x14, vget_high_s8(vb14), vget_high_s8(va0));
+        vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
+        int16x8_t vprod0x15 = vmull_s8(vget_low_s8(vb15), vget_low_s8(va0));
+        vprod0x15 = vmlal_s8(vprod0x15, vget_high_s8(vb15), vget_high_s8(va0));
+        vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
+
+        k += 16 * sizeof(int8_t);
+      }
+
+      p -= 1 * sizeof(void*);
+    } while (p != 0);
+
+#if XNN_ARCH_ARM64
+    const int32x4_t vsum0x01 = vpaddq_s32(vacc0x0, vacc0x1);
+    const int32x4_t vsum0x23 = vpaddq_s32(vacc0x2, vacc0x3);
+    const int32x4_t vsum0x45 = vpaddq_s32(vacc0x4, vacc0x5);
+    const int32x4_t vsum0x67 = vpaddq_s32(vacc0x6, vacc0x7);
+    const int32x4_t vsum0x89 = vpaddq_s32(vacc0x8, vacc0x9);
+    const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11);
+    const int32x4_t vsum0xCD = vpaddq_s32(vacc0x12, vacc0x13);
+    const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
+    int32x4_t vacc0x0123 = vpaddq_s32(vsum0x01, vsum0x23);
+    int32x4_t vacc0x4567 = vpaddq_s32(vsum0x45, vsum0x67);
+    int32x4_t vacc0x89AB = vpaddq_s32(vsum0x89, vsum0xAB);
+    int32x4_t vacc0xCDEF = vpaddq_s32(vsum0xCD, vsum0xEF);
+#else
+    const int32x2_t vpsum0x0 = vadd_s32(vget_low_s32(vacc0x0), vget_high_s32(vacc0x0));
+    const int32x2_t vpsum0x1 = vadd_s32(vget_low_s32(vacc0x1), vget_high_s32(vacc0x1));
+    const int32x2_t vpsum0x2 = vadd_s32(vget_low_s32(vacc0x2), vget_high_s32(vacc0x2));
+    const int32x2_t vpsum0x3 = vadd_s32(vget_low_s32(vacc0x3), vget_high_s32(vacc0x3));
+    const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1);
+    const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3);
+    int32x4_t vacc0x0123 = vcombine_s32(vsum0x01, vsum0x23);
+    const int32x2_t vpsum0x4 = vadd_s32(vget_low_s32(vacc0x4), vget_high_s32(vacc0x4));
+    const int32x2_t vpsum0x5 = vadd_s32(vget_low_s32(vacc0x5), vget_high_s32(vacc0x5));
+    const int32x2_t vpsum0x6 = vadd_s32(vget_low_s32(vacc0x6), vget_high_s32(vacc0x6));
+    const int32x2_t vpsum0x7 = vadd_s32(vget_low_s32(vacc0x7), vget_high_s32(vacc0x7));
+    const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5);
+    const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7);
+    int32x4_t vacc0x4567 = vcombine_s32(vsum0x45, vsum0x67);
+    const int32x2_t vpsum0x8 = vadd_s32(vget_low_s32(vacc0x8), vget_high_s32(vacc0x8));
+    const int32x2_t vpsum0x9 = vadd_s32(vget_low_s32(vacc0x9), vget_high_s32(vacc0x9));
+    const int32x2_t vpsum0xA = vadd_s32(vget_low_s32(vacc0x10), vget_high_s32(vacc0x10));
+    const int32x2_t vpsum0xB = vadd_s32(vget_low_s32(vacc0x11), vget_high_s32(vacc0x11));
+    const int32x2_t vsum0x89 = vpadd_s32(vpsum0x8, vpsum0x9);
+    const int32x2_t vsum0xAB = vpadd_s32(vpsum0xA, vpsum0xB);
+    int32x4_t vacc0x89AB = vcombine_s32(vsum0x89, vsum0xAB);
+    const int32x2_t vpsum0xC = vadd_s32(vget_low_s32(vacc0x12), vget_high_s32(vacc0x12));
+    const int32x2_t vpsum0xD = vadd_s32(vget_low_s32(vacc0x13), vget_high_s32(vacc0x13));
+    const int32x2_t vpsum0xE = vadd_s32(vget_low_s32(vacc0x14), vget_high_s32(vacc0x14));
+    const int32x2_t vpsum0xF = vadd_s32(vget_low_s32(vacc0x15), vget_high_s32(vacc0x15));
+    const int32x2_t vsum0xCD = vpadd_s32(vpsum0xC, vpsum0xD);
+    const int32x2_t vsum0xEF = vpadd_s32(vpsum0xE, vpsum0xF);
+    int32x4_t vacc0xCDEF = vcombine_s32(vsum0xCD, vsum0xEF);
+#endif
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+    vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+    vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+    int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+
+    vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+
+    if (nc >= 16) {
+      vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 16;
+    } else {
+      int8x8_t vout0x01234567 = vget_low_s8(vout0x0123456789ABCDEF);
+      if (nc & 8) {
+        vst1_s8(c0, vout0x01234567); c0 += 8;
+        vout0x01234567 = vget_high_s8(vout0x0123456789ABCDEF);
+      }
+      if (nc & 4) {
+        vst1_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpret_u32_s8(vout0x01234567), 0); c0 += 4;
+        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpret_u16_s8(vout0x01234567), 0); c0 += 2;
+        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1_lane_s8(c0, vout0x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/1x16c2-minmax-neon-mlal-padal-dup.c b/src/qs8-igemm/gen/1x16c2-minmax-neon-mlal-padal-dup.c
new file mode 100644
index 0000000..84cc0e7
--- /dev/null
+++ b/src/qs8-igemm/gen/1x16c2-minmax-neon-mlal-padal-dup.c
@@ -0,0 +1,325 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/c2-neon-mull-padal-dup.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (1 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  int8_t* c0 = c;
+
+  do {
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      a += 1;
+
+      size_t k = kc;
+
+      while (k >= 16 * sizeof(int8_t)) {
+        const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va0x1 = vld1_s8(a0); a0 += 8;
+
+        const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+        const int8x8_t vb0123c0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x0123c0 = vmlal_s8(vprod0x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+        int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+        const int8x8_t vb4567c0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x4567c0 = vmlal_s8(vprod0x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+        int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+        const int8x8_t vb89ABc0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x89ABc0 = vmlal_s8(vprod0x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+        int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+        const int8x8_t vbCDEFc0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0xCDEFc0 = vmlal_s8(vprod0xCDEFc0, vbCDEFc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+        int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+        const int8x8_t vb0123c1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x0123c1 = vmlal_s8(vprod0x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+        int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+        const int8x8_t vb4567c1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x4567c1 = vmlal_s8(vprod0x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+        int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+        const int8x8_t vb89ABc1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x89ABc1 = vmlal_s8(vprod0x89ABc1, vb89ABc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+        int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+        const int8x8_t vbCDEFc1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0xCDEFc1 = vmlal_s8(vprod0xCDEFc1, vbCDEFc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+        int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+        const int8x8_t vb0123c2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x0123c2 = vmlal_s8(vprod0x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+        int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+        const int8x8_t vb4567c2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x4567c2 = vmlal_s8(vprod0x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+        int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+        const int8x8_t vb89ABc2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x89ABc2 = vmlal_s8(vprod0x89ABc2, vb89ABc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+        int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+        const int8x8_t vbCDEFc2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0xCDEFc2 = vmlal_s8(vprod0xCDEFc2, vbCDEFc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+        int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+        const int8x8_t vb0123c3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x0123c3 = vmlal_s8(vprod0x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+        int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+        const int8x8_t vb4567c3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x4567c3 = vmlal_s8(vprod0x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+        int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+        const int8x8_t vb89ABc3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x89ABc3 = vmlal_s8(vprod0x89ABc3, vb89ABc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
+        int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+        const int8x8_t vbCDEFc3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0xCDEFc3 = vmlal_s8(vprod0xCDEFc3, vbCDEFc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
+
+        k -= 16 * sizeof(int8_t);
+      }
+
+      if (k >= 8 * sizeof(int8_t)) {
+        const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+
+        const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+        const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+        const int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
+        const int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
+
+        k -= 8 * sizeof(int8_t);
+      }
+
+      if XNN_UNLIKELY(k != 0) {
+        const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+
+        const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+        const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+        const int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+        const int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+
+        if (k > 2 * sizeof(int8_t)) {
+          const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb89ABc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vbCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+          const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+          const int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+          const int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+
+          if (k > 4 * sizeof(int8_t)) {
+            const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int8x8_t vb89ABc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int8x8_t vbCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+            const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+            const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+            const int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+            const int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+          }
+        }
+      }
+      p -= 1 * sizeof(void*);
+    } while (p != 0);
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+    vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+    vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+
+    vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+
+    if (nc >= 16) {
+      vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 16;
+    } else {
+      int8x8_t vout0x01234567 = vget_low_s8(vout0x0123456789ABCDEF);
+      if (nc & 8) {
+        vst1_s8(c0, vout0x01234567); c0 += 8;
+        vout0x01234567 = vget_high_s8(vout0x0123456789ABCDEF);
+      }
+      if (nc & 4) {
+        vst1_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpret_u32_s8(vout0x01234567), 0); c0 += 4;
+        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpret_u16_s8(vout0x01234567), 0); c0 += 2;
+        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1_lane_s8(c0, vout0x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/1x16c2-minmax-neon-mull-padal-dup.c b/src/qs8-igemm/gen/1x16c2-minmax-neon-mull-padal-dup.c
new file mode 100644
index 0000000..9ae6fc7
--- /dev/null
+++ b/src/qs8-igemm/gen/1x16c2-minmax-neon-mull-padal-dup.c
@@ -0,0 +1,237 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/c2-neon-mull-padal-dup.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (1 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  int8_t* c0 = c;
+
+  do {
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      a += 1;
+
+      size_t k = kc;
+
+      while (k >= 8 * sizeof(int8_t)) {
+        const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+
+        const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+        const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+        const int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
+        const int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
+
+        k -= 8 * sizeof(int8_t);
+      }
+
+      if XNN_UNLIKELY(k != 0) {
+        const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+
+        const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+        const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+        const int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+        const int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+
+        if (k > 2 * sizeof(int8_t)) {
+          const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb89ABc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vbCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+          const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+          const int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+          const int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+
+          if (k > 4 * sizeof(int8_t)) {
+            const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int8x8_t vb89ABc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int8x8_t vbCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+            const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+            const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+            const int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+            const int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+          }
+        }
+      }
+      p -= 1 * sizeof(void*);
+    } while (p != 0);
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+    vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+    vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+
+    vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+
+    if (nc >= 16) {
+      vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 16;
+    } else {
+      int8x8_t vout0x01234567 = vget_low_s8(vout0x0123456789ABCDEF);
+      if (nc & 8) {
+        vst1_s8(c0, vout0x01234567); c0 += 8;
+        vout0x01234567 = vget_high_s8(vout0x0123456789ABCDEF);
+      }
+      if (nc & 4) {
+        vst1_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpret_u32_s8(vout0x01234567), 0); c0 += 4;
+        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpret_u16_s8(vout0x01234567), 0); c0 += 2;
+        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1_lane_s8(c0, vout0x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/1x16c4-minmax-neondot.c b/src/qs8-igemm/gen/1x16c4-minmax-neondot.c
index 984dfb8..afb0cea 100644
--- a/src/qs8-igemm/gen/1x16c4-minmax-neondot.c
+++ b/src/qs8-igemm/gen/1x16c4-minmax-neondot.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot(
@@ -40,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 4);
   int8_t* c0 = c;
 
   do {
@@ -64,14 +65,14 @@
         const int8x8_t va0x01234567 = vld1_s8(a0); a0 += 8;
 
         // Load a 8x16 block of weights.
-        const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb0123x89AB = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb0123xCDEF = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb4567x89AB = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb4567xCDEF = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
+        const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+        const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+        const int8x16_t vb0123x89AB = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+        const int8x16_t vb0123xCDEF = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+        const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+        const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+        const int8x16_t vb4567x89AB = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+        const int8x16_t vb4567xCDEF = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
 
         // Multiply-accumulate: 1x8 * 8x16 --> 1x16.
         vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb0123x0123, va0x01234567, 0);
@@ -85,36 +86,22 @@
 
         k -= 8 * sizeof(int8_t);
       }
-      // Handle up to 7 final positions of `k`
+      // Handle up to 4 final positions of `k`
       if XNN_UNLIKELY(k != 0) {
         // Load a 1x4 block of activations.
         const int8x8_t va0x01234567 = vld1_s8(a0);
 
         // Load a 4x16 block of weights.
-        const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb0123x89AB = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb0123xCDEF = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
+        const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+        const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+        const int8x16_t vb0123x89AB = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+        const int8x16_t vb0123xCDEF = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
 
         // Multiply-accumulate: 1x4 * 4x16 --> 1x16.
         vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb0123x0123, va0x01234567, 0);
         vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb0123x4567, va0x01234567, 0);
         vacc0x89AB = vdotq_lane_s32(vacc0x89AB, vb0123x89AB, va0x01234567, 0);
         vacc0xCDEF = vdotq_lane_s32(vacc0xCDEF, vb0123xCDEF, va0x01234567, 0);
-
-        if (k > 4) {
-          // Load a 4x16 block of weights.
-          const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-          const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-          const int8x16_t vb4567x89AB = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-          const int8x16_t vb4567xCDEF = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-
-          // Multiply-accumulate: 1x4 * 4x16 --> 1x16.
-          vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
-          vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
-          vacc0x89AB = vdotq_lane_s32(vacc0x89AB, vb4567x89AB, va0x01234567, 1);
-          vacc0xCDEF = vdotq_lane_s32(vacc0xCDEF, vb4567xCDEF, va0x01234567, 1);
-        }
       }
       p -= 1 * sizeof(void*);
     } while (p != 0);
diff --git a/src/qs8-igemm/gen/1x16c8-minmax-avx512skx.c b/src/qs8-igemm/gen/1x16c8-minmax-avx512skx.c
index 924734f..34e6a19 100644
--- a/src/qs8-igemm/gen/1x16c8-minmax-avx512skx.c
+++ b/src/qs8-igemm/gen/1x16c8-minmax-avx512skx.c
@@ -13,6 +13,7 @@
 
 #include <xnnpack/igemm.h>
 #include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx(
@@ -38,6 +39,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
 
   const __mmask16 vbias_mask = _cvtu32_mask16(0x1111);
diff --git a/src/qs8-igemm/gen/1x16c8-minmax-neon-mlal-padal.c b/src/qs8-igemm/gen/1x16c8-minmax-neon-mlal-padal.c
new file mode 100644
index 0000000..4304b19
--- /dev/null
+++ b/src/qs8-igemm/gen/1x16c8-minmax-neon-mlal-padal.c
@@ -0,0 +1,331 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/c8-neon-mull-padal.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (1 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  int8_t* c0 = c;
+
+  do {
+    int32x4_t vacc0x0 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x1 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x2 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x3 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x4 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x5 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x6 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x7 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x8 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x9 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x10 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x11 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x12 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x13 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x14 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x15 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      a += 1;
+
+      size_t k = kc;
+      // 2x partial unrolled loop to load 16 bytes at a time using MLA.
+      while (k >= 16 * sizeof(int8_t)) {
+        const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va0x1 = vld1_s8(a0); a0 += 8;
+
+        const int8x8_t vb0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb4x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb5x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb6x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb7x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb8x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb9x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb10x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb11x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb12x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb13x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb14x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb15x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+
+        const int8x8_t vb0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x0 = vmull_s8(vb0x0, va0x0);
+        vprod0x0 = vmlal_s8(vprod0x0, vb0x1, va0x1);
+        vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+        const int8x8_t vb1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x1 = vmull_s8(vb1x0, va0x0);
+        vprod0x1 = vmlal_s8(vprod0x1, vb1x1, va0x1);
+        vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+        const int8x8_t vb2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x2 = vmull_s8(vb2x0, va0x0);
+        vprod0x2 = vmlal_s8(vprod0x2, vb2x1, va0x1);
+        vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+        const int8x8_t vb3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x3 = vmull_s8(vb3x0, va0x0);
+        vprod0x3 = vmlal_s8(vprod0x3, vb3x1, va0x1);
+        vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+        const int8x8_t vb4x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x4 = vmull_s8(vb4x0, va0x0);
+        vprod0x4 = vmlal_s8(vprod0x4, vb4x1, va0x1);
+        vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+        const int8x8_t vb5x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x5 = vmull_s8(vb5x0, va0x0);
+        vprod0x5 = vmlal_s8(vprod0x5, vb5x1, va0x1);
+        vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+        const int8x8_t vb6x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x6 = vmull_s8(vb6x0, va0x0);
+        vprod0x6 = vmlal_s8(vprod0x6, vb6x1, va0x1);
+        vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+        const int8x8_t vb7x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x7 = vmull_s8(vb7x0, va0x0);
+        vprod0x7 = vmlal_s8(vprod0x7, vb7x1, va0x1);
+        vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+        const int8x8_t vb8x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x8 = vmull_s8(vb8x0, va0x0);
+        vprod0x8 = vmlal_s8(vprod0x8, vb8x1, va0x1);
+        vacc0x8 = vpadalq_s16(vacc0x8, vprod0x8);
+        const int8x8_t vb9x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x9 = vmull_s8(vb9x0, va0x0);
+        vprod0x9 = vmlal_s8(vprod0x9, vb9x1, va0x1);
+        vacc0x9 = vpadalq_s16(vacc0x9, vprod0x9);
+        const int8x8_t vb10x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x10 = vmull_s8(vb10x0, va0x0);
+        vprod0x10 = vmlal_s8(vprod0x10, vb10x1, va0x1);
+        vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
+        const int8x8_t vb11x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x11 = vmull_s8(vb11x0, va0x0);
+        vprod0x11 = vmlal_s8(vprod0x11, vb11x1, va0x1);
+        vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11);
+        const int8x8_t vb12x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x12 = vmull_s8(vb12x0, va0x0);
+        vprod0x12 = vmlal_s8(vprod0x12, vb12x1, va0x1);
+        vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);
+        const int8x8_t vb13x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x13 = vmull_s8(vb13x0, va0x0);
+        vprod0x13 = vmlal_s8(vprod0x13, vb13x1, va0x1);
+        vacc0x13 = vpadalq_s16(vacc0x13, vprod0x13);
+        const int8x8_t vb14x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x14 = vmull_s8(vb14x0, va0x0);
+        vprod0x14 = vmlal_s8(vprod0x14, vb14x1, va0x1);
+        vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
+        const int8x8_t vb15x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x15 = vmull_s8(vb15x0, va0x0);
+        vprod0x15 = vmlal_s8(vprod0x15, vb15x1, va0x1);
+        vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
+
+        k -= 16 * sizeof(int8_t);
+      }
+
+      // Handle 8 bytes at a time using MUL.
+      if (k > 0) {
+        const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+
+        const int8x8_t vb0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x0 = vmull_s8(vb0, va0);
+        vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+        const int8x8_t vb1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x1 = vmull_s8(vb1, va0);
+        vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+        const int8x8_t vb2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x2 = vmull_s8(vb2, va0);
+        vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+        const int8x8_t vb3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x3 = vmull_s8(vb3, va0);
+        vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+        const int8x8_t vb4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x4 = vmull_s8(vb4, va0);
+        vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+        const int8x8_t vb5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x5 = vmull_s8(vb5, va0);
+        vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+        const int8x8_t vb6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x6 = vmull_s8(vb6, va0);
+        vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+        const int8x8_t vb7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x7 = vmull_s8(vb7, va0);
+        vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+        const int8x8_t vb8 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x8 = vmull_s8(vb8, va0);
+        vacc0x8 = vpadalq_s16(vacc0x8, vprod0x8);
+        const int8x8_t vb9 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x9 = vmull_s8(vb9, va0);
+        vacc0x9 = vpadalq_s16(vacc0x9, vprod0x9);
+        const int8x8_t vb10 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x10 = vmull_s8(vb10, va0);
+        vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
+        const int8x8_t vb11 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x11 = vmull_s8(vb11, va0);
+        vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11);
+        const int8x8_t vb12 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x12 = vmull_s8(vb12, va0);
+        vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);
+        const int8x8_t vb13 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x13 = vmull_s8(vb13, va0);
+        vacc0x13 = vpadalq_s16(vacc0x13, vprod0x13);
+        const int8x8_t vb14 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x14 = vmull_s8(vb14, va0);
+        vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
+        const int8x8_t vb15 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x15 = vmull_s8(vb15, va0);
+        vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
+
+        k -= 8 * sizeof(int8_t);
+      }
+
+      p -= 1 * sizeof(void*);
+    } while (p != 0);
+
+#if XNN_ARCH_ARM64
+    const int32x4_t vsum0x01 = vpaddq_s32(vacc0x0, vacc0x1);
+    const int32x4_t vsum0x23 = vpaddq_s32(vacc0x2, vacc0x3);
+    const int32x4_t vsum0x45 = vpaddq_s32(vacc0x4, vacc0x5);
+    const int32x4_t vsum0x67 = vpaddq_s32(vacc0x6, vacc0x7);
+    const int32x4_t vsum0x89 = vpaddq_s32(vacc0x8, vacc0x9);
+    const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11);
+    const int32x4_t vsum0xCD = vpaddq_s32(vacc0x12, vacc0x13);
+    const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
+    int32x4_t vacc0x0123 = vpaddq_s32(vsum0x01, vsum0x23);
+    int32x4_t vacc0x4567 = vpaddq_s32(vsum0x45, vsum0x67);
+    int32x4_t vacc0x89AB = vpaddq_s32(vsum0x89, vsum0xAB);
+    int32x4_t vacc0xCDEF = vpaddq_s32(vsum0xCD, vsum0xEF);
+#else
+    const int32x2_t vpsum0x0 = vadd_s32(vget_low_s32(vacc0x0), vget_high_s32(vacc0x0));
+    const int32x2_t vpsum0x1 = vadd_s32(vget_low_s32(vacc0x1), vget_high_s32(vacc0x1));
+    const int32x2_t vpsum0x2 = vadd_s32(vget_low_s32(vacc0x2), vget_high_s32(vacc0x2));
+    const int32x2_t vpsum0x3 = vadd_s32(vget_low_s32(vacc0x3), vget_high_s32(vacc0x3));
+    const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1);
+    const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3);
+    int32x4_t vacc0x0123 = vcombine_s32(vsum0x01, vsum0x23 );
+    const int32x2_t vpsum0x4 = vadd_s32(vget_low_s32(vacc0x4), vget_high_s32(vacc0x4));
+    const int32x2_t vpsum0x5 = vadd_s32(vget_low_s32(vacc0x5), vget_high_s32(vacc0x5));
+    const int32x2_t vpsum0x6 = vadd_s32(vget_low_s32(vacc0x6), vget_high_s32(vacc0x6));
+    const int32x2_t vpsum0x7 = vadd_s32(vget_low_s32(vacc0x7), vget_high_s32(vacc0x7));
+    const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5);
+    const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7);
+    int32x4_t vacc0x4567 = vcombine_s32(vsum0x45, vsum0x67 );
+    const int32x2_t vpsum0x8 = vadd_s32(vget_low_s32(vacc0x8), vget_high_s32(vacc0x8));
+    const int32x2_t vpsum0x9 = vadd_s32(vget_low_s32(vacc0x9), vget_high_s32(vacc0x9));
+    const int32x2_t vpsum0xA = vadd_s32(vget_low_s32(vacc0x10), vget_high_s32(vacc0x10));
+    const int32x2_t vpsum0xB = vadd_s32(vget_low_s32(vacc0x11), vget_high_s32(vacc0x11));
+    const int32x2_t vsum0x89 = vpadd_s32(vpsum0x8, vpsum0x9);
+    const int32x2_t vsum0xAB = vpadd_s32(vpsum0xA, vpsum0xB);
+    int32x4_t vacc0x89AB = vcombine_s32(vsum0x89, vsum0xAB );
+    const int32x2_t vpsum0xC = vadd_s32(vget_low_s32(vacc0x12), vget_high_s32(vacc0x12));
+    const int32x2_t vpsum0xD = vadd_s32(vget_low_s32(vacc0x13), vget_high_s32(vacc0x13));
+    const int32x2_t vpsum0xE = vadd_s32(vget_low_s32(vacc0x14), vget_high_s32(vacc0x14));
+    const int32x2_t vpsum0xF = vadd_s32(vget_low_s32(vacc0x15), vget_high_s32(vacc0x15));
+    const int32x2_t vsum0xCD = vpadd_s32(vpsum0xC, vpsum0xD);
+    const int32x2_t vsum0xEF = vpadd_s32(vpsum0xE, vpsum0xF);
+    int32x4_t vacc0xCDEF = vcombine_s32(vsum0xCD, vsum0xEF );
+#endif
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+    vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+    vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+    int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+
+    vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+
+    if (nc >= 16) {
+      vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 16;
+    } else {
+      int8x8_t vout0x01234567 = vget_low_s8(vout0x0123456789ABCDEF);
+      if (nc & 8) {
+        vst1_s8(c0, vout0x01234567); c0 += 8;
+        vout0x01234567 = vget_high_s8(vout0x0123456789ABCDEF);
+      }
+      if (nc & 4) {
+        vst1_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpret_u32_s8(vout0x01234567), 0); c0 += 4;
+        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpret_u16_s8(vout0x01234567), 0); c0 += 2;
+        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1_lane_s8(c0, vout0x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/1x16c8-minmax-neon-mull-padal.c b/src/qs8-igemm/gen/1x16c8-minmax-neon-mull-padal.c
new file mode 100644
index 0000000..9f4a4d3
--- /dev/null
+++ b/src/qs8-igemm/gen/1x16c8-minmax-neon-mull-padal.c
@@ -0,0 +1,242 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/c8-neon-mull-padal.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (1 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  int8_t* c0 = c;
+
+  do {
+    int32x4_t vacc0x0 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x1 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x2 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x3 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x4 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x5 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x6 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x7 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x8 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x9 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x10 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x11 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x12 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x13 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x14 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x15 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      a += 1;
+
+      size_t k = kc;
+
+      // Handle 8 bytes at a time using MUL.
+      while (k > 0) {
+        const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+
+        const int8x8_t vb0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x0 = vmull_s8(vb0, va0);
+        vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+        const int8x8_t vb1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x1 = vmull_s8(vb1, va0);
+        vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+        const int8x8_t vb2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x2 = vmull_s8(vb2, va0);
+        vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+        const int8x8_t vb3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x3 = vmull_s8(vb3, va0);
+        vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+        const int8x8_t vb4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x4 = vmull_s8(vb4, va0);
+        vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+        const int8x8_t vb5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x5 = vmull_s8(vb5, va0);
+        vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+        const int8x8_t vb6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x6 = vmull_s8(vb6, va0);
+        vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+        const int8x8_t vb7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x7 = vmull_s8(vb7, va0);
+        vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+        const int8x8_t vb8 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x8 = vmull_s8(vb8, va0);
+        vacc0x8 = vpadalq_s16(vacc0x8, vprod0x8);
+        const int8x8_t vb9 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x9 = vmull_s8(vb9, va0);
+        vacc0x9 = vpadalq_s16(vacc0x9, vprod0x9);
+        const int8x8_t vb10 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x10 = vmull_s8(vb10, va0);
+        vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
+        const int8x8_t vb11 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x11 = vmull_s8(vb11, va0);
+        vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11);
+        const int8x8_t vb12 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x12 = vmull_s8(vb12, va0);
+        vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);
+        const int8x8_t vb13 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x13 = vmull_s8(vb13, va0);
+        vacc0x13 = vpadalq_s16(vacc0x13, vprod0x13);
+        const int8x8_t vb14 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x14 = vmull_s8(vb14, va0);
+        vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
+        const int8x8_t vb15 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x15 = vmull_s8(vb15, va0);
+        vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
+
+        k -= 8 * sizeof(int8_t);
+      }
+
+      p -= 1 * sizeof(void*);
+    } while (p != 0);
+
+#if XNN_ARCH_ARM64
+    const int32x4_t vsum0x01 = vpaddq_s32(vacc0x0, vacc0x1);
+    const int32x4_t vsum0x23 = vpaddq_s32(vacc0x2, vacc0x3);
+    const int32x4_t vsum0x45 = vpaddq_s32(vacc0x4, vacc0x5);
+    const int32x4_t vsum0x67 = vpaddq_s32(vacc0x6, vacc0x7);
+    const int32x4_t vsum0x89 = vpaddq_s32(vacc0x8, vacc0x9);
+    const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11);
+    const int32x4_t vsum0xCD = vpaddq_s32(vacc0x12, vacc0x13);
+    const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
+    int32x4_t vacc0x0123 = vpaddq_s32(vsum0x01, vsum0x23);
+    int32x4_t vacc0x4567 = vpaddq_s32(vsum0x45, vsum0x67);
+    int32x4_t vacc0x89AB = vpaddq_s32(vsum0x89, vsum0xAB);
+    int32x4_t vacc0xCDEF = vpaddq_s32(vsum0xCD, vsum0xEF);
+#else
+    const int32x2_t vpsum0x0 = vadd_s32(vget_low_s32(vacc0x0), vget_high_s32(vacc0x0));
+    const int32x2_t vpsum0x1 = vadd_s32(vget_low_s32(vacc0x1), vget_high_s32(vacc0x1));
+    const int32x2_t vpsum0x2 = vadd_s32(vget_low_s32(vacc0x2), vget_high_s32(vacc0x2));
+    const int32x2_t vpsum0x3 = vadd_s32(vget_low_s32(vacc0x3), vget_high_s32(vacc0x3));
+    const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1);
+    const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3);
+    int32x4_t vacc0x0123 = vcombine_s32(vsum0x01, vsum0x23 );
+    const int32x2_t vpsum0x4 = vadd_s32(vget_low_s32(vacc0x4), vget_high_s32(vacc0x4));
+    const int32x2_t vpsum0x5 = vadd_s32(vget_low_s32(vacc0x5), vget_high_s32(vacc0x5));
+    const int32x2_t vpsum0x6 = vadd_s32(vget_low_s32(vacc0x6), vget_high_s32(vacc0x6));
+    const int32x2_t vpsum0x7 = vadd_s32(vget_low_s32(vacc0x7), vget_high_s32(vacc0x7));
+    const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5);
+    const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7);
+    int32x4_t vacc0x4567 = vcombine_s32(vsum0x45, vsum0x67 );
+    const int32x2_t vpsum0x8 = vadd_s32(vget_low_s32(vacc0x8), vget_high_s32(vacc0x8));
+    const int32x2_t vpsum0x9 = vadd_s32(vget_low_s32(vacc0x9), vget_high_s32(vacc0x9));
+    const int32x2_t vpsum0xA = vadd_s32(vget_low_s32(vacc0x10), vget_high_s32(vacc0x10));
+    const int32x2_t vpsum0xB = vadd_s32(vget_low_s32(vacc0x11), vget_high_s32(vacc0x11));
+    const int32x2_t vsum0x89 = vpadd_s32(vpsum0x8, vpsum0x9);
+    const int32x2_t vsum0xAB = vpadd_s32(vpsum0xA, vpsum0xB);
+    int32x4_t vacc0x89AB = vcombine_s32(vsum0x89, vsum0xAB );
+    const int32x2_t vpsum0xC = vadd_s32(vget_low_s32(vacc0x12), vget_high_s32(vacc0x12));
+    const int32x2_t vpsum0xD = vadd_s32(vget_low_s32(vacc0x13), vget_high_s32(vacc0x13));
+    const int32x2_t vpsum0xE = vadd_s32(vget_low_s32(vacc0x14), vget_high_s32(vacc0x14));
+    const int32x2_t vpsum0xF = vadd_s32(vget_low_s32(vacc0x15), vget_high_s32(vacc0x15));
+    const int32x2_t vsum0xCD = vpadd_s32(vpsum0xC, vpsum0xD);
+    const int32x2_t vsum0xEF = vpadd_s32(vpsum0xE, vpsum0xF);
+    int32x4_t vacc0xCDEF = vcombine_s32(vsum0xCD, vsum0xEF );
+#endif
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+    vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+    vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+    int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+
+    vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+
+    if (nc >= 16) {
+      vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 16;
+    } else {
+      int8x8_t vout0x01234567 = vget_low_s8(vout0x0123456789ABCDEF);
+      if (nc & 8) {
+        vst1_s8(c0, vout0x01234567); c0 += 8;
+        vout0x01234567 = vget_high_s8(vout0x0123456789ABCDEF);
+      }
+      if (nc & 4) {
+        vst1_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpret_u32_s8(vout0x01234567), 0); c0 += 4;
+        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpret_u16_s8(vout0x01234567), 0); c0 += 2;
+        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1_lane_s8(c0, vout0x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-sse2-ld128.c b/src/qs8-igemm/gen/1x4c2-minmax-sse2-ld128.c
index 42dc29e..cdff4d5 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-sse2-ld128.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-sse2-ld128.c
@@ -12,6 +12,7 @@
 #include <emmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_1x4c2__sse2_ld128(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   int8_t* c0 = c;
 
   do {
@@ -110,15 +112,6 @@
 
             vacc0x0123 = _mm_add_epi32(vacc0x0123,
               _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-            if (k > 6 * sizeof(int8_t)) {
-              const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-              w = (const void*) ((uintptr_t) w + 8);
-              const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
-
-              vacc0x0123 = _mm_add_epi32(vacc0x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            }
           }
         }
       }
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-sse2-ld64.c b/src/qs8-igemm/gen/1x4c2-minmax-sse2-ld64.c
index d78c425..bd6ce42 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-sse2-ld64.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-sse2-ld64.c
@@ -12,6 +12,7 @@
 #include <emmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_1x4c2__sse2_ld64(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   int8_t* c0 = c;
 
   do {
@@ -110,15 +112,6 @@
 
             vacc0x0123 = _mm_add_epi32(vacc0x0123,
               _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-            if (k > 6 * sizeof(int8_t)) {
-              const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-              w = (const void*) ((uintptr_t) w + 8);
-              const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
-
-              vacc0x0123 = _mm_add_epi32(vacc0x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            }
           }
         }
       }
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-sse41-ld128.c b/src/qs8-igemm/gen/1x4c2-minmax-sse41-ld128.c
index 0d5b1dc..312afa9 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-sse41-ld128.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-sse41-ld128.c
@@ -12,6 +12,7 @@
 #include <smmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   int8_t* c0 = c;
 
   do {
@@ -110,15 +112,6 @@
 
             vacc0x0123 = _mm_add_epi32(vacc0x0123,
               _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-            if (k > 6 * sizeof(int8_t)) {
-              const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-              w = (const void*) ((uintptr_t) w + 8);
-              const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
-
-              vacc0x0123 = _mm_add_epi32(vacc0x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            }
           }
         }
       }
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-sse41-ld64.c b/src/qs8-igemm/gen/1x4c2-minmax-sse41-ld64.c
index 2827d03..af2b116 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-sse41-ld64.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-sse41-ld64.c
@@ -12,6 +12,7 @@
 #include <smmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   int8_t* c0 = c;
 
   do {
@@ -110,15 +112,6 @@
 
             vacc0x0123 = _mm_add_epi32(vacc0x0123,
               _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-            if (k > 6 * sizeof(int8_t)) {
-              const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-              w = (const void*) ((uintptr_t) w + 8);
-              const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
-
-              vacc0x0123 = _mm_add_epi32(vacc0x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            }
           }
         }
       }
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-ssse3-ld128.c b/src/qs8-igemm/gen/1x4c2-minmax-ssse3-ld128.c
index b9aba97..5f308dd 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-ssse3-ld128.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-ssse3-ld128.c
@@ -12,6 +12,7 @@
 #include <tmmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_1x4c2__ssse3_ld128(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   int8_t* c0 = c;
 
   do {
@@ -110,15 +112,6 @@
 
             vacc0x0123 = _mm_add_epi32(vacc0x0123,
               _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-            if (k > 6 * sizeof(int8_t)) {
-              const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-              w = (const void*) ((uintptr_t) w + 8);
-              const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
-
-              vacc0x0123 = _mm_add_epi32(vacc0x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            }
           }
         }
       }
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-ssse3-ld64.c b/src/qs8-igemm/gen/1x4c2-minmax-ssse3-ld64.c
index 862a3f6..ff8241a 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-ssse3-ld64.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-ssse3-ld64.c
@@ -12,6 +12,7 @@
 #include <tmmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_1x4c2__ssse3_ld64(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   int8_t* c0 = c;
 
   do {
@@ -110,15 +112,6 @@
 
             vacc0x0123 = _mm_add_epi32(vacc0x0123,
               _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-            if (k > 6 * sizeof(int8_t)) {
-              const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-              w = (const void*) ((uintptr_t) w + 8);
-              const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
-
-              vacc0x0123 = _mm_add_epi32(vacc0x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            }
           }
         }
       }
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-xop-ld128.c b/src/qs8-igemm/gen/1x4c2-minmax-xop-ld128.c
index 5882450..fe4d345 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-xop-ld128.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-xop-ld128.c
@@ -17,6 +17,7 @@
 #endif
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128(
@@ -44,6 +45,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   int8_t* c0 = c;
 
   do {
@@ -115,15 +117,6 @@
 
             vacc0x0123 = _mm_maddd_epi16(
               _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
-
-            if (k > 6 * sizeof(int8_t)) {
-              const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-              w = (const void*) ((uintptr_t) w + 8);
-              const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
-
-              vacc0x0123 = _mm_maddd_epi16(
-                _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc0x0123);
-            }
           }
         }
       }
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-xop-ld64.c b/src/qs8-igemm/gen/1x4c2-minmax-xop-ld64.c
index 5aafdf0..7417f8e 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-xop-ld64.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-xop-ld64.c
@@ -17,6 +17,7 @@
 #endif
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64(
@@ -44,6 +45,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   int8_t* c0 = c;
 
   do {
@@ -115,15 +117,6 @@
 
             vacc0x0123 = _mm_maddd_epi16(
               _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
-
-            if (k > 6 * sizeof(int8_t)) {
-              const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-              w = (const void*) ((uintptr_t) w + 8);
-              const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
-
-              vacc0x0123 = _mm_maddd_epi16(
-                _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc0x0123);
-            }
           }
         }
       }
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-sse2-ld128.c b/src/qs8-igemm/gen/1x4c8-minmax-sse2-ld128.c
index fe9eb8c..6837212 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-sse2-ld128.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-sse2-ld128.c
@@ -12,6 +12,7 @@
 #include <emmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_1x4c8__sse2_ld128(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
 
   do {
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-sse2-ld64.c b/src/qs8-igemm/gen/1x4c8-minmax-sse2-ld64.c
index 2dd1d31..fc91529 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-sse2-ld64.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-sse2-ld64.c
@@ -12,6 +12,7 @@
 #include <emmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_1x4c8__sse2_ld64(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
 
   do {
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-sse41-ld128.c b/src/qs8-igemm/gen/1x4c8-minmax-sse41-ld128.c
index 10b9d02..66f8bbb 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-sse41-ld128.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-sse41-ld128.c
@@ -12,6 +12,7 @@
 #include <smmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
 
   do {
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-sse41-ld64.c b/src/qs8-igemm/gen/1x4c8-minmax-sse41-ld64.c
index c171ead..4031b08 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-sse41-ld64.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-sse41-ld64.c
@@ -12,6 +12,7 @@
 #include <smmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
 
   do {
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-ssse3-ld128.c b/src/qs8-igemm/gen/1x4c8-minmax-ssse3-ld128.c
index 561d72f..b04f5bf 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-ssse3-ld128.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-ssse3-ld128.c
@@ -12,6 +12,7 @@
 #include <tmmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_1x4c8__ssse3_ld128(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
 
   do {
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-ssse3-ld64.c b/src/qs8-igemm/gen/1x4c8-minmax-ssse3-ld64.c
index f95561d..6036ab6 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-ssse3-ld64.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-ssse3-ld64.c
@@ -12,6 +12,7 @@
 #include <tmmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_1x4c8__ssse3_ld64(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
 
   do {
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-wasmsimd-ld128.c b/src/qs8-igemm/gen/1x4c8-minmax-wasmsimd-ld128.c
index 8a104c0..9af046a 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-wasmsimd-ld128.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-wasmsimd-ld128.c
@@ -12,6 +12,7 @@
 #include <wasm_simd128.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_1x4c8__wasmsimd_ld128(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
 
   const v128_t vzero = wasm_f64x2_splat(0.0);
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-wasmsimd-ld64.c b/src/qs8-igemm/gen/1x4c8-minmax-wasmsimd-ld64.c
index 15bc8bb..faadba9 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-wasmsimd-ld64.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-wasmsimd-ld64.c
@@ -12,6 +12,7 @@
 #include <wasm_simd128.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_1x4c8__wasmsimd_ld64(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
 
   const v128_t vzero = wasm_f64x2_splat(0.0);
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-xop-ld128.c b/src/qs8-igemm/gen/1x4c8-minmax-xop-ld128.c
index 0686d4e..aa7eca9 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-xop-ld128.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-xop-ld128.c
@@ -17,6 +17,7 @@
 #endif
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128(
@@ -44,6 +45,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
 
   do {
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-xop-ld64.c b/src/qs8-igemm/gen/1x4c8-minmax-xop-ld64.c
index fb46c83..1204750 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-xop-ld64.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-xop-ld64.c
@@ -17,6 +17,7 @@
 #endif
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64(
@@ -44,6 +45,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
 
   do {
diff --git a/src/qs8-igemm/gen/1x8-minmax-neon-mull-addw-dup.c b/src/qs8-igemm/gen/1x8-minmax-neon-mull-addw-dup.c
new file mode 100644
index 0000000..bcf3557
--- /dev/null
+++ b/src/qs8-igemm/gen/1x8-minmax-neon-mull-addw-dup.c
@@ -0,0 +1,217 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/neon-mull-addw-dup.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/gemm.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_1x8__neon_mull_addw_dup(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (1 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  int8_t* c0 = c;
+
+  do {
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      a += 1;
+
+      size_t k = kc;
+      while (k >= 8 * sizeof(int8_t)) {
+        const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+
+        const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va0, 0));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c0));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c0));
+        const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va0, 1));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c1));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c1));
+        const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va0, 2));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c2));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c2));
+        const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));
+        const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va0, 4));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c4));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c4));
+        const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va0, 5));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c5));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c5));
+        const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va0, 6));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c6));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c6));
+        const int8x8_t vb01234567c7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c7 = vmull_s8(vb01234567c7, vdup_lane_s8(va0, 7));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c7));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c7));
+
+        k -= 8 * sizeof(int8_t);
+      }
+      if XNN_UNLIKELY(k != 0) {
+        const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+
+        const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va0, 0));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c0));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c0));
+
+        if (k >= 2 * sizeof(int8_t)) {
+          const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          const int16x8_t vprod0x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va0, 1));
+          vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c1));
+          vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c1));
+
+          if (k > 2 * sizeof(int8_t)) {
+            const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+            const int16x8_t vprod0x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va0, 2));
+            vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c2));
+            vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c2));
+
+            if (k >= 4 * sizeof(int8_t)) {
+              const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+              const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));
+              vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
+              vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));
+
+              if (k > 4 * sizeof(int8_t)) {
+                const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+                const int16x8_t vprod0x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va0, 4));
+                vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c4));
+                vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c4));
+
+                if (k >= 6 * sizeof(int8_t)) {
+                  const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+                  const int16x8_t vprod0x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va0, 5));
+                  vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c5));
+                  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c5));
+
+                  if (k > 6 * sizeof(int8_t)) {
+                    const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+                    const int16x8_t vprod0x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va0, 6));
+                    vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c6));
+                    vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c6));
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+      p -= 1 * sizeof(void*);
+    } while (p != 0);
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+
+    int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+
+    int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567);
+#endif
+    const int8x8_t voutput_min = vld1_dup_s8(&params->neon.output_min);
+    const int8x8_t voutput_max = vld1_dup_s8(&params->neon.output_max);
+
+    vout0x01234567 = vmax_s8(vout0x01234567, voutput_min);
+
+    vout0x01234567 = vmin_s8(vout0x01234567, voutput_max);
+
+    if (nc >= 8) {
+      vst1_s8(c0 + 0, vout0x01234567);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        vst1_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpret_u32_s8(vout0x01234567), 0); c0 += 4;
+        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpret_u16_s8(vout0x01234567), 0); c0 += 2;
+        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1_lane_s8(c0, vout0x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/1x8c16-minmax-neon-mlal-padal.c b/src/qs8-igemm/gen/1x8c16-minmax-neon-mlal-padal.c
new file mode 100644
index 0000000..e81fbe7
--- /dev/null
+++ b/src/qs8-igemm/gen/1x8c16-minmax-neon-mlal-padal.c
@@ -0,0 +1,185 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/c16-neon-mlal-padal.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_1x8c16__neon_mlal_padal(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (1 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 16);
+  int8_t* c0 = c;
+
+  do {
+    int32x4_t vacc0x0 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x1 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x2 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x3 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x4 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x5 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x6 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x7 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      a += 1;
+
+      // KC loop of 16 with up to 15 remainder
+      size_t k = 0;
+      while (k < kc) {
+        const int8x16_t va0 = vld1q_s8(a0); a0 += 16;
+
+        const int8x16_t vb0 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb1 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb2 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb3 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb4 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb5 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb6 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb7 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+
+        int16x8_t vprod0x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va0));
+        vprod0x0 = vmlal_s8(vprod0x0, vget_high_s8(vb0), vget_high_s8(va0));
+        vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+        int16x8_t vprod0x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va0));
+        vprod0x1 = vmlal_s8(vprod0x1, vget_high_s8(vb1), vget_high_s8(va0));
+        vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+        int16x8_t vprod0x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va0));
+        vprod0x2 = vmlal_s8(vprod0x2, vget_high_s8(vb2), vget_high_s8(va0));
+        vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+        int16x8_t vprod0x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va0));
+        vprod0x3 = vmlal_s8(vprod0x3, vget_high_s8(vb3), vget_high_s8(va0));
+        vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+        int16x8_t vprod0x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va0));
+        vprod0x4 = vmlal_s8(vprod0x4, vget_high_s8(vb4), vget_high_s8(va0));
+        vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+        int16x8_t vprod0x5 = vmull_s8(vget_low_s8(vb5), vget_low_s8(va0));
+        vprod0x5 = vmlal_s8(vprod0x5, vget_high_s8(vb5), vget_high_s8(va0));
+        vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+        int16x8_t vprod0x6 = vmull_s8(vget_low_s8(vb6), vget_low_s8(va0));
+        vprod0x6 = vmlal_s8(vprod0x6, vget_high_s8(vb6), vget_high_s8(va0));
+        vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+        int16x8_t vprod0x7 = vmull_s8(vget_low_s8(vb7), vget_low_s8(va0));
+        vprod0x7 = vmlal_s8(vprod0x7, vget_high_s8(vb7), vget_high_s8(va0));
+        vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+
+        k += 16 * sizeof(int8_t);
+      }
+
+      p -= 1 * sizeof(void*);
+    } while (p != 0);
+
+#if XNN_ARCH_ARM64
+    const int32x4_t vsum0x01 = vpaddq_s32(vacc0x0, vacc0x1);
+    const int32x4_t vsum0x23 = vpaddq_s32(vacc0x2, vacc0x3);
+    const int32x4_t vsum0x45 = vpaddq_s32(vacc0x4, vacc0x5);
+    const int32x4_t vsum0x67 = vpaddq_s32(vacc0x6, vacc0x7);
+    int32x4_t vacc0x0123 = vpaddq_s32(vsum0x01, vsum0x23);
+    int32x4_t vacc0x4567 = vpaddq_s32(vsum0x45, vsum0x67);
+#else
+    const int32x2_t vpsum0x0 = vadd_s32(vget_low_s32(vacc0x0), vget_high_s32(vacc0x0));
+    const int32x2_t vpsum0x1 = vadd_s32(vget_low_s32(vacc0x1), vget_high_s32(vacc0x1));
+    const int32x2_t vpsum0x2 = vadd_s32(vget_low_s32(vacc0x2), vget_high_s32(vacc0x2));
+    const int32x2_t vpsum0x3 = vadd_s32(vget_low_s32(vacc0x3), vget_high_s32(vacc0x3));
+    const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1);
+    const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3);
+    int32x4_t vacc0x0123 = vcombine_s32(vsum0x01, vsum0x23 );
+    const int32x2_t vpsum0x4 = vadd_s32(vget_low_s32(vacc0x4), vget_high_s32(vacc0x4));
+    const int32x2_t vpsum0x5 = vadd_s32(vget_low_s32(vacc0x5), vget_high_s32(vacc0x5));
+    const int32x2_t vpsum0x6 = vadd_s32(vget_low_s32(vacc0x6), vget_high_s32(vacc0x6));
+    const int32x2_t vpsum0x7 = vadd_s32(vget_low_s32(vacc0x7), vget_high_s32(vacc0x7));
+    const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5);
+    const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7);
+    int32x4_t vacc0x4567 = vcombine_s32(vsum0x45, vsum0x67 );
+#endif
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+
+    int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567);
+#endif
+    const int8x8_t voutput_min = vld1_dup_s8(&params->neon.output_min);
+    const int8x8_t voutput_max = vld1_dup_s8(&params->neon.output_max);
+
+    vout0x01234567 = vmax_s8(vout0x01234567, voutput_min);
+
+    vout0x01234567 = vmin_s8(vout0x01234567, voutput_max);
+
+    if (nc >= 8) {
+      vst1_s8(c0 + 0, vout0x01234567);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        vst1_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpret_u32_s8(vout0x01234567), 0); c0 += 4;
+        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpret_u16_s8(vout0x01234567), 0); c0 += 2;
+        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1_lane_s8(c0, vout0x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/1x8c2-minmax-neon-mlal-padal-dup.c b/src/qs8-igemm/gen/1x8c2-minmax-neon-mlal-padal-dup.c
new file mode 100644
index 0000000..ccd894e
--- /dev/null
+++ b/src/qs8-igemm/gen/1x8c2-minmax-neon-mlal-padal-dup.c
@@ -0,0 +1,228 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/c2-neon-mull-padal-dup.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (1 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  int8_t* c0 = c;
+
+  do {
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      a += 1;
+
+      size_t k = kc;
+
+      while (k >= 16 * sizeof(int8_t)) {
+        const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va0x1 = vld1_s8(a0); a0 += 8;
+
+        const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+        const int8x8_t vb0123c0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x0123c0 = vmlal_s8(vprod0x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+        int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+        const int8x8_t vb4567c0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x4567c0 = vmlal_s8(vprod0x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+        int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+        const int8x8_t vb0123c1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x0123c1 = vmlal_s8(vprod0x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+        int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+        const int8x8_t vb4567c1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x4567c1 = vmlal_s8(vprod0x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+        int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+        const int8x8_t vb0123c2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x0123c2 = vmlal_s8(vprod0x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+        int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+        const int8x8_t vb4567c2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x4567c2 = vmlal_s8(vprod0x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+        int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+        const int8x8_t vb0123c3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x0123c3 = vmlal_s8(vprod0x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+        int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+        const int8x8_t vb4567c3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x4567c3 = vmlal_s8(vprod0x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+
+        k -= 16 * sizeof(int8_t);
+      }
+
+      if (k >= 8 * sizeof(int8_t)) {
+        const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+
+        const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+        const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+
+        k -= 8 * sizeof(int8_t);
+      }
+
+      if XNN_UNLIKELY(k != 0) {
+        const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+
+        const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+        const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+
+        if (k > 2 * sizeof(int8_t)) {
+          const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+          const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+
+          if (k > 4 * sizeof(int8_t)) {
+            const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+            const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+            const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+          }
+        }
+      }
+      p -= 1 * sizeof(void*);
+    } while (p != 0);
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+
+    int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+
+    int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567);
+#endif
+    const int8x8_t voutput_min = vld1_dup_s8(&params->neon.output_min);
+    const int8x8_t voutput_max = vld1_dup_s8(&params->neon.output_max);
+
+    vout0x01234567 = vmax_s8(vout0x01234567, voutput_min);
+
+    vout0x01234567 = vmin_s8(vout0x01234567, voutput_max);
+
+    if (nc >= 8) {
+      vst1_s8(c0 + 0, vout0x01234567);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        vst1_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpret_u32_s8(vout0x01234567), 0); c0 += 4;
+        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpret_u16_s8(vout0x01234567), 0); c0 += 2;
+        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1_lane_s8(c0, vout0x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/1x8c2-minmax-neon-mull-padal-dup.c b/src/qs8-igemm/gen/1x8c2-minmax-neon-mull-padal-dup.c
new file mode 100644
index 0000000..c2f45d1
--- /dev/null
+++ b/src/qs8-igemm/gen/1x8c2-minmax-neon-mull-padal-dup.c
@@ -0,0 +1,180 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/c2-neon-mull-padal-dup.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (1 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  int8_t* c0 = c;
+
+  do {
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      a += 1;
+
+      size_t k = kc;
+
+
+      while (k >= 8 * sizeof(int8_t)) {
+        const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+
+        const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+        const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+
+        k -= 8 * sizeof(int8_t);
+      }
+
+      if XNN_UNLIKELY(k != 0) {
+        const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+
+        const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+        const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+
+        if (k > 2 * sizeof(int8_t)) {
+          const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+          const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+
+          if (k > 4 * sizeof(int8_t)) {
+            const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+            const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+            const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+          }
+        }
+      }
+      p -= 1 * sizeof(void*);
+    } while (p != 0);
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+
+    int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+
+    int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567);
+#endif
+    const int8x8_t voutput_min = vld1_dup_s8(&params->neon.output_min);
+    const int8x8_t voutput_max = vld1_dup_s8(&params->neon.output_max);
+
+    vout0x01234567 = vmax_s8(vout0x01234567, voutput_min);
+
+    vout0x01234567 = vmin_s8(vout0x01234567, voutput_max);
+
+    if (nc >= 8) {
+      vst1_s8(c0 + 0, vout0x01234567);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        vst1_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpret_u32_s8(vout0x01234567), 0); c0 += 4;
+        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpret_u16_s8(vout0x01234567), 0); c0 += 2;
+        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1_lane_s8(c0, vout0x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/1x8c4-minmax-neondot.c b/src/qs8-igemm/gen/1x8c4-minmax-neondot.c
index 2a885c4..dd311f3 100644
--- a/src/qs8-igemm/gen/1x8c4-minmax-neondot.c
+++ b/src/qs8-igemm/gen/1x8c4-minmax-neondot.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_1x8c4__neondot(
@@ -40,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 4);
   int8_t* c0 = c;
 
   do {
@@ -62,10 +63,10 @@
         const int8x8_t va0x01234567 = vld1_s8(a0); a0 += 8;
 
         // Load a 8x8 block of weights.
-        const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
+        const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+        const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+        const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+        const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
 
         // Multiply-accumulate: 1x8 * 8x8 --> 1x8.
         vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb0123x0123, va0x01234567, 0);
@@ -75,28 +76,18 @@
 
         k -= 8 * sizeof(int8_t);
       }
-      // Handle up to 7 final positions of `k`
+      // Handle up to 4 final positions of `k`
       if XNN_UNLIKELY(k != 0) {
         // Load a 1x4 block of activations.
         const int8x8_t va0x01234567 = vld1_s8(a0);
 
         // Load a 4x8 block of weights.
-        const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
+        const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+        const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
 
         // Multiply-accumulate: 1x4 * 4x8 --> 1x8.
         vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb0123x0123, va0x01234567, 0);
         vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb0123x4567, va0x01234567, 0);
-
-        if (k > 4) {
-          // Load a 4x8 block of weights.
-          const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-          const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-
-          // Multiply-accumulate: 1x4 * 4x8 --> 1x8.
-          vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
-          vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
-        }
       }
       p -= 1 * sizeof(void*);
     } while (p != 0);
diff --git a/src/qs8-igemm/gen/1x8c8-minmax-avx2.c b/src/qs8-igemm/gen/1x8c8-minmax-avx2.c
index 4eaa400..4c41935 100644
--- a/src/qs8-igemm/gen/1x8c8-minmax-avx2.c
+++ b/src/qs8-igemm/gen/1x8c8-minmax-avx2.c
@@ -13,6 +13,7 @@
 
 #include <xnnpack/igemm.h>
 #include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2(
@@ -40,6 +41,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
 
   do {
diff --git a/src/qs8-igemm/gen/1x8c8-minmax-neon-mlal-padal.c b/src/qs8-igemm/gen/1x8c8-minmax-neon-mlal-padal.c
new file mode 100644
index 0000000..f2760a8
--- /dev/null
+++ b/src/qs8-igemm/gen/1x8c8-minmax-neon-mlal-padal.c
@@ -0,0 +1,226 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/c8-neon-mull-padal.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mlal_padal(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (1 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  int8_t* c0 = c;
+
+  do {
+    int32x4_t vacc0x0 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x1 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x2 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x3 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x4 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x5 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x6 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x7 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      a += 1;
+
+      size_t k = kc;
+      // 2x partial unrolled loop to load 16 bytes at a time using MLA.
+      while (k >= 16 * sizeof(int8_t)) {
+        const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va0x1 = vld1_s8(a0); a0 += 8;
+
+        const int8x8_t vb0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb4x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb5x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb6x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb7x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+
+        const int8x8_t vb0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x0 = vmull_s8(vb0x0, va0x0);
+        vprod0x0 = vmlal_s8(vprod0x0, vb0x1, va0x1);
+        vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+        const int8x8_t vb1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x1 = vmull_s8(vb1x0, va0x0);
+        vprod0x1 = vmlal_s8(vprod0x1, vb1x1, va0x1);
+        vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+        const int8x8_t vb2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x2 = vmull_s8(vb2x0, va0x0);
+        vprod0x2 = vmlal_s8(vprod0x2, vb2x1, va0x1);
+        vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+        const int8x8_t vb3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x3 = vmull_s8(vb3x0, va0x0);
+        vprod0x3 = vmlal_s8(vprod0x3, vb3x1, va0x1);
+        vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+        const int8x8_t vb4x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x4 = vmull_s8(vb4x0, va0x0);
+        vprod0x4 = vmlal_s8(vprod0x4, vb4x1, va0x1);
+        vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+        const int8x8_t vb5x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x5 = vmull_s8(vb5x0, va0x0);
+        vprod0x5 = vmlal_s8(vprod0x5, vb5x1, va0x1);
+        vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+        const int8x8_t vb6x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x6 = vmull_s8(vb6x0, va0x0);
+        vprod0x6 = vmlal_s8(vprod0x6, vb6x1, va0x1);
+        vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+        const int8x8_t vb7x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x7 = vmull_s8(vb7x0, va0x0);
+        vprod0x7 = vmlal_s8(vprod0x7, vb7x1, va0x1);
+        vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+
+        k -= 16 * sizeof(int8_t);
+      }
+
+      // Handle 8 bytes at a time using MUL.
+      if (k > 0) {
+        const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+
+        const int8x8_t vb0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x0 = vmull_s8(vb0, va0);
+        vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+        const int8x8_t vb1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x1 = vmull_s8(vb1, va0);
+        vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+        const int8x8_t vb2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x2 = vmull_s8(vb2, va0);
+        vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+        const int8x8_t vb3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x3 = vmull_s8(vb3, va0);
+        vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+        const int8x8_t vb4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x4 = vmull_s8(vb4, va0);
+        vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+        const int8x8_t vb5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x5 = vmull_s8(vb5, va0);
+        vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+        const int8x8_t vb6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x6 = vmull_s8(vb6, va0);
+        vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+        const int8x8_t vb7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x7 = vmull_s8(vb7, va0);
+        vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+
+        k -= 8 * sizeof(int8_t);
+      }
+
+      p -= 1 * sizeof(void*);
+    } while (p != 0);
+
+#if XNN_ARCH_ARM64
+    const int32x4_t vsum0x01 = vpaddq_s32(vacc0x0, vacc0x1);
+    const int32x4_t vsum0x23 = vpaddq_s32(vacc0x2, vacc0x3);
+    const int32x4_t vsum0x45 = vpaddq_s32(vacc0x4, vacc0x5);
+    const int32x4_t vsum0x67 = vpaddq_s32(vacc0x6, vacc0x7);
+    int32x4_t vacc0x0123 = vpaddq_s32(vsum0x01, vsum0x23);
+    int32x4_t vacc0x4567 = vpaddq_s32(vsum0x45, vsum0x67);
+#else
+    const int32x2_t vpsum0x0 = vadd_s32(vget_low_s32(vacc0x0), vget_high_s32(vacc0x0));
+    const int32x2_t vpsum0x1 = vadd_s32(vget_low_s32(vacc0x1), vget_high_s32(vacc0x1));
+    const int32x2_t vpsum0x2 = vadd_s32(vget_low_s32(vacc0x2), vget_high_s32(vacc0x2));
+    const int32x2_t vpsum0x3 = vadd_s32(vget_low_s32(vacc0x3), vget_high_s32(vacc0x3));
+    const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1);
+    const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3);
+    int32x4_t vacc0x0123 = vcombine_s32(vsum0x01, vsum0x23 );
+    const int32x2_t vpsum0x4 = vadd_s32(vget_low_s32(vacc0x4), vget_high_s32(vacc0x4));
+    const int32x2_t vpsum0x5 = vadd_s32(vget_low_s32(vacc0x5), vget_high_s32(vacc0x5));
+    const int32x2_t vpsum0x6 = vadd_s32(vget_low_s32(vacc0x6), vget_high_s32(vacc0x6));
+    const int32x2_t vpsum0x7 = vadd_s32(vget_low_s32(vacc0x7), vget_high_s32(vacc0x7));
+    const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5);
+    const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7);
+    int32x4_t vacc0x4567 = vcombine_s32(vsum0x45, vsum0x67 );
+#endif
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+
+    int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567);
+#endif
+    const int8x8_t voutput_min = vld1_dup_s8(&params->neon.output_min);
+    const int8x8_t voutput_max = vld1_dup_s8(&params->neon.output_max);
+
+    vout0x01234567 = vmax_s8(vout0x01234567, voutput_min);
+
+    vout0x01234567 = vmin_s8(vout0x01234567, voutput_max);
+
+    if (nc >= 8) {
+      vst1_s8(c0 + 0, vout0x01234567);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        vst1_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpret_u32_s8(vout0x01234567), 0); c0 += 4;
+        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpret_u16_s8(vout0x01234567), 0); c0 += 2;
+        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1_lane_s8(c0, vout0x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/1x8c8-minmax-neon-mull-padal.c b/src/qs8-igemm/gen/1x8c8-minmax-neon-mull-padal.c
new file mode 100644
index 0000000..b853ca4
--- /dev/null
+++ b/src/qs8-igemm/gen/1x8c8-minmax-neon-mull-padal.c
@@ -0,0 +1,177 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/c8-neon-mull-padal.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mull_padal(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (1 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  int8_t* c0 = c;
+
+  do {
+    int32x4_t vacc0x0 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x1 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x2 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x3 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x4 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x5 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x6 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x7 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      a += 1;
+
+      size_t k = kc;
+
+      // Handle 8 bytes at a time using MUL.
+      while (k > 0) {
+        const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+
+        const int8x8_t vb0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x0 = vmull_s8(vb0, va0);
+        vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+        const int8x8_t vb1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x1 = vmull_s8(vb1, va0);
+        vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+        const int8x8_t vb2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x2 = vmull_s8(vb2, va0);
+        vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+        const int8x8_t vb3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x3 = vmull_s8(vb3, va0);
+        vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+        const int8x8_t vb4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x4 = vmull_s8(vb4, va0);
+        vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+        const int8x8_t vb5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x5 = vmull_s8(vb5, va0);
+        vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+        const int8x8_t vb6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x6 = vmull_s8(vb6, va0);
+        vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+        const int8x8_t vb7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x7 = vmull_s8(vb7, va0);
+        vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+
+        k -= 8 * sizeof(int8_t);
+      }
+
+      p -= 1 * sizeof(void*);
+    } while (p != 0);
+
+#if XNN_ARCH_ARM64
+    const int32x4_t vsum0x01 = vpaddq_s32(vacc0x0, vacc0x1);
+    const int32x4_t vsum0x23 = vpaddq_s32(vacc0x2, vacc0x3);
+    const int32x4_t vsum0x45 = vpaddq_s32(vacc0x4, vacc0x5);
+    const int32x4_t vsum0x67 = vpaddq_s32(vacc0x6, vacc0x7);
+    int32x4_t vacc0x0123 = vpaddq_s32(vsum0x01, vsum0x23);
+    int32x4_t vacc0x4567 = vpaddq_s32(vsum0x45, vsum0x67);
+#else
+    const int32x2_t vpsum0x0 = vadd_s32(vget_low_s32(vacc0x0), vget_high_s32(vacc0x0));
+    const int32x2_t vpsum0x1 = vadd_s32(vget_low_s32(vacc0x1), vget_high_s32(vacc0x1));
+    const int32x2_t vpsum0x2 = vadd_s32(vget_low_s32(vacc0x2), vget_high_s32(vacc0x2));
+    const int32x2_t vpsum0x3 = vadd_s32(vget_low_s32(vacc0x3), vget_high_s32(vacc0x3));
+    const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1);
+    const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3);
+    int32x4_t vacc0x0123 = vcombine_s32(vsum0x01, vsum0x23);
+    const int32x2_t vpsum0x4 = vadd_s32(vget_low_s32(vacc0x4), vget_high_s32(vacc0x4));
+    const int32x2_t vpsum0x5 = vadd_s32(vget_low_s32(vacc0x5), vget_high_s32(vacc0x5));
+    const int32x2_t vpsum0x6 = vadd_s32(vget_low_s32(vacc0x6), vget_high_s32(vacc0x6));
+    const int32x2_t vpsum0x7 = vadd_s32(vget_low_s32(vacc0x7), vget_high_s32(vacc0x7));
+    const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5);
+    const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7);
+    int32x4_t vacc0x4567 = vcombine_s32(vsum0x45, vsum0x67);
+#endif
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+
+    int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567);
+#endif
+    const int8x8_t voutput_min = vld1_dup_s8(&params->neon.output_min);
+    const int8x8_t voutput_max = vld1_dup_s8(&params->neon.output_max);
+
+    vout0x01234567 = vmax_s8(vout0x01234567, voutput_min);
+
+    vout0x01234567 = vmin_s8(vout0x01234567, voutput_max);
+
+    if (nc >= 8) {
+      vst1_s8(c0 + 0, vout0x01234567);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        vst1_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpret_u32_s8(vout0x01234567), 0); c0 += 4;
+        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpret_u16_s8(vout0x01234567), 0); c0 += 2;
+        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1_lane_s8(c0, vout0x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
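
For reference, the requantization applied to every accumulator above (vqrdmulhq_s32, the vbicq_s32/vsraq_n_s32 fixup followed by vrshlq_s32, the zero-point add, and the min/max clamp) can be modelled in scalar C roughly as follows. This is an illustrative sketch only: it ignores the saturation corner case of vqrdmulhq_s32, assumes arithmetic right shifts on signed integers, and takes `shift` as the positive right-shift count (the kernel passes it to vrshlq_s32 as a non-positive left-shift value).

    #include <stdint.h>

    /* Scalar model of the per-element QS8 requantization:
     *   1. Q31 rounding doubling high multiply (vqrdmulhq_s32).
     *   2. Rounding arithmetic right shift with ties away from zero
     *      (the vbicq_s32/vsraq_n_s32 fixup plus vrshlq_s32).
     *   3. Add the output zero point, then clamp to [output_min, output_max]. */
    static inline int8_t requantize_qs8(int32_t acc, int32_t multiplier, uint32_t shift,
                                        int16_t output_zero_point,
                                        int8_t output_min, int8_t output_max)
    {
      const int32_t product =
          (int32_t) (((int64_t) acc * (int64_t) multiplier + (INT64_C(1) << 30)) >> 31);
      const int32_t adjustment = (shift != 0 && product < 0) ? 1 : 0;
      const int32_t rounding = (shift != 0) ? (INT32_C(1) << (shift - 1)) : 0;
      const int32_t scaled = (product - adjustment + rounding) >> shift;
      int32_t out = scaled + (int32_t) output_zero_point;
      if (out < (int32_t) output_min) out = (int32_t) output_min;
      if (out > (int32_t) output_max) out = (int32_t) output_max;
      return (int8_t) out;
    }
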
diff --git a/src/qs8-igemm/gen/2x16-minmax-neon-mull-addw-dup.c b/src/qs8-igemm/gen/2x16-minmax-neon-mull-addw-dup.c
new file mode 100644
index 0000000..b4ca64f
--- /dev/null
+++ b/src/qs8-igemm/gen/2x16-minmax-neon-mull-addw-dup.c
@@ -0,0 +1,430 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/neon-mull-addw-dup.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/gemm.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (2 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    c1 = c0;
+  }
+
+  do {
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc1x0123 = vacc0x0123;
+    int32x4_t vacc1x4567 = vacc0x4567;
+    int32x4_t vacc1x89AB = vacc0x89AB;
+    int32x4_t vacc1xCDEF = vacc0xCDEF;
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      a += 2;
+
+      size_t k = kc;
+      while (k >= 8 * sizeof(int8_t)) {
+        const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+
+        const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va0, 0));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c0));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c0));
+        const int16x8_t vprod1x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va1, 0));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c0));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c0));
+        const int8x8_t vb89ABCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va0, 0));
+        vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc0));
+        vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc0));
+        const int16x8_t vprod1x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va1, 0));
+        vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc0));
+        vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc0));
+        const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va0, 1));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c1));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c1));
+        const int16x8_t vprod1x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va1, 1));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c1));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c1));
+        const int8x8_t vb89ABCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va0, 1));
+        vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc1));
+        vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc1));
+        const int16x8_t vprod1x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va1, 1));
+        vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc1));
+        vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc1));
+        const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va0, 2));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c2));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c2));
+        const int16x8_t vprod1x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va1, 2));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c2));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c2));
+        const int8x8_t vb89ABCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va0, 2));
+        vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc2));
+        vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc2));
+        const int16x8_t vprod1x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va1, 2));
+        vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc2));
+        vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc2));
+        const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));
+        const int16x8_t vprod1x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va1, 3));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c3));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c3));
+        const int8x8_t vb89ABCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va0, 3));
+        vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc3));
+        vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc3));
+        const int16x8_t vprod1x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va1, 3));
+        vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc3));
+        vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc3));
+        const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va0, 4));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c4));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c4));
+        const int16x8_t vprod1x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va1, 4));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c4));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c4));
+        const int8x8_t vb89ABCDEFc4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va0, 4));
+        vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc4));
+        vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc4));
+        const int16x8_t vprod1x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va1, 4));
+        vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc4));
+        vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc4));
+        const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va0, 5));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c5));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c5));
+        const int16x8_t vprod1x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va1, 5));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c5));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c5));
+        const int8x8_t vb89ABCDEFc5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va0, 5));
+        vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc5));
+        vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc5));
+        const int16x8_t vprod1x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va1, 5));
+        vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc5));
+        vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc5));
+        const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va0, 6));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c6));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c6));
+        const int16x8_t vprod1x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va1, 6));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c6));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c6));
+        const int8x8_t vb89ABCDEFc6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va0, 6));
+        vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc6));
+        vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc6));
+        const int16x8_t vprod1x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va1, 6));
+        vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc6));
+        vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc6));
+        const int8x8_t vb01234567c7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c7 = vmull_s8(vb01234567c7, vdup_lane_s8(va0, 7));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c7));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c7));
+        const int16x8_t vprod1x01234567c7 = vmull_s8(vb01234567c7, vdup_lane_s8(va1, 7));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c7));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c7));
+        const int8x8_t vb89ABCDEFc7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x89ABCDEFc7 = vmull_s8(vb89ABCDEFc7, vdup_lane_s8(va0, 7));
+        vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc7));
+        vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc7));
+        const int16x8_t vprod1x89ABCDEFc7 = vmull_s8(vb89ABCDEFc7, vdup_lane_s8(va1, 7));
+        vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc7));
+        vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc7));
+
+        k -= 8 * sizeof(int8_t);
+      }
+      if XNN_UNLIKELY(k != 0) {
+        const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+        const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
+
+        const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va0, 0));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c0));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c0));
+        const int16x8_t vprod0x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va0, 0));
+        vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc0));
+        vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc0));
+        const int16x8_t vprod1x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va1, 0));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c0));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c0));
+        const int16x8_t vprod1x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va1, 0));
+        vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc0));
+        vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc0));
+
+        if (k >= 2 * sizeof(int8_t)) {
+          const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb89ABCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          const int16x8_t vprod0x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va0, 1));
+          vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c1));
+          vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c1));
+          const int16x8_t vprod0x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va0, 1));
+          vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc1));
+          vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc1));
+          const int16x8_t vprod1x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va1, 1));
+          vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c1));
+          vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c1));
+          const int16x8_t vprod1x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va1, 1));
+          vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc1));
+          vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc1));
+
+          if (k > 2 * sizeof(int8_t)) {
+            const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int8x8_t vb89ABCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+            const int16x8_t vprod0x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va0, 2));
+            vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c2));
+            vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c2));
+            const int16x8_t vprod0x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va0, 2));
+            vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc2));
+            vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc2));
+            const int16x8_t vprod1x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va1, 2));
+            vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c2));
+            vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c2));
+            const int16x8_t vprod1x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va1, 2));
+            vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc2));
+            vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc2));
+
+            if (k >= 4 * sizeof(int8_t)) {
+              const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+              const int8x8_t vb89ABCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+              const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));
+              vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
+              vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));
+              const int16x8_t vprod0x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va0, 3));
+              vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc3));
+              vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc3));
+              const int16x8_t vprod1x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va1, 3));
+              vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c3));
+              vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c3));
+              const int16x8_t vprod1x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va1, 3));
+              vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc3));
+              vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc3));
+
+              if (k > 4 * sizeof(int8_t)) {
+                const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+                const int8x8_t vb89ABCDEFc4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+                const int16x8_t vprod0x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va0, 4));
+                vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c4));
+                vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c4));
+                const int16x8_t vprod0x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va0, 4));
+                vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc4));
+                vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc4));
+                const int16x8_t vprod1x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va1, 4));
+                vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c4));
+                vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c4));
+                const int16x8_t vprod1x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va1, 4));
+                vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc4));
+                vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc4));
+
+                if (k >= 6 * sizeof(int8_t)) {
+                  const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+                  const int8x8_t vb89ABCDEFc5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+                  const int16x8_t vprod0x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va0, 5));
+                  vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c5));
+                  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c5));
+                  const int16x8_t vprod0x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va0, 5));
+                  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc5));
+                  vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc5));
+                  const int16x8_t vprod1x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va1, 5));
+                  vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c5));
+                  vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c5));
+                  const int16x8_t vprod1x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va1, 5));
+                  vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc5));
+                  vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc5));
+
+                  if (k > 6 * sizeof(int8_t)) {
+                    const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+                    const int8x8_t vb89ABCDEFc6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+                    const int16x8_t vprod0x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va0, 6));
+                    vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c6));
+                    vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c6));
+                    const int16x8_t vprod0x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va0, 6));
+                    vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc6));
+                    vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc6));
+                    const int16x8_t vprod1x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va1, 6));
+                    vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c6));
+                    vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c6));
+                    const int16x8_t vprod1x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va1, 6));
+                    vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc6));
+                    vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc6));
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+      p -= 2 * sizeof(void*);
+    } while (p != 0);
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+    vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc1x89AB = vqrdmulhq_s32(vacc1x89AB, vmultiplier);
+    vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+    vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc1x89AB = vsraq_n_s32(vacc1x89AB, vbicq_s32(vacc1x89AB, vzero_shift_mask), 31);
+    vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc1x89AB = vrshlq_s32(vacc1x89AB, vright_shift);
+    vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+    int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+    int8x16_t vout1x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc1x01234567), vqmovn_s16(vacc1x89ABCDEF));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout1x0123456789ABCDEF = vmaxq_s8(vout1x0123456789ABCDEF, voutput_min);
+    vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+
+    vout1x0123456789ABCDEF = vminq_s8(vout1x0123456789ABCDEF, voutput_max);
+    vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+
+    if (nc >= 16) {
+      vst1q_s8(c1 + 0, vout1x0123456789ABCDEF);
+      vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 16;
+    } else {
+      int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8(vout1x0123456789ABCDEF));
+      if (nc & 8) {
+        vst1_s8(c1, vget_high_s8(vout0x01234567_1x01234567)); c1 += 8;
+        vst1_s8(c0, vget_low_s8(vout0x01234567_1x01234567)); c0 += 8;
+        vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0123456789ABCDEF));
+      }
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
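
Unlike the direct GEMM kernels, the IGEMM kernels in this change take `a` as an indirection buffer: each of the `ks / (MR * sizeof(void*))` steps consumes MR row pointers, rebases every pointer that is not `zero` by `a_offset`, and after a full-width column store rewinds `a` by `ks` bytes so the same indirection entries are reused for the next `nc` block. A hypothetical reference for a single logical accumulator is sketched below; the function name and the flat `column_weights` layout are illustrative, since the real packed-weight layout is kernel-specific.

    #include <stddef.h>
    #include <stdint.h>

    /* Hypothetical reference for one IGEMM accumulator (row m, one output column),
     * before requantization. */
    static int32_t igemm_ref_acc(size_t m, size_t mr, size_t steps, size_t kc,
                                 const int8_t** a, size_t a_offset, const int8_t* zero,
                                 int32_t bias, const int8_t* column_weights)
    {
      int32_t acc = bias;  /* the leading int32 values read from `w` are the biases */
      for (size_t s = 0; s < steps; s++) {  /* steps == ks / (mr * sizeof(void*)) */
        const int8_t* am = a[s * mr + m];
        if (am != zero) {
          am = (const int8_t*) ((uintptr_t) am + a_offset);  /* rebase the indirection pointer */
        }
        for (size_t i = 0; i < kc; i++) {
          acc += (int32_t) am[i] * (int32_t) column_weights[s * kc + i];
        }
      }
      return acc;
    }
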
diff --git a/src/qs8-igemm/gen/2x16c16-minmax-neon-mlal-padal.c b/src/qs8-igemm/gen/2x16c16-minmax-neon-mlal-padal.c
new file mode 100644
index 0000000..db2455b
--- /dev/null
+++ b/src/qs8-igemm/gen/2x16c16-minmax-neon-mlal-padal.c
@@ -0,0 +1,397 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/c16-neon-mlal-padal.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (2 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 16);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    c1 = c0;
+  }
+
+  do {
+    int32x4_t vacc0x0 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x1 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x2 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x3 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x4 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x5 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x6 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x7 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x8 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x9 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x10 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x11 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x12 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x13 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x14 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x15 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc1x0 = vacc0x0;
+    int32x4_t vacc1x1 = vacc0x1;
+    int32x4_t vacc1x2 = vacc0x2;
+    int32x4_t vacc1x3 = vacc0x3;
+    int32x4_t vacc1x4 = vacc0x4;
+    int32x4_t vacc1x5 = vacc0x5;
+    int32x4_t vacc1x6 = vacc0x6;
+    int32x4_t vacc1x7 = vacc0x7;
+    int32x4_t vacc1x8 = vacc0x8;
+    int32x4_t vacc1x9 = vacc0x9;
+    int32x4_t vacc1x10 = vacc0x10;
+    int32x4_t vacc1x11 = vacc0x11;
+    int32x4_t vacc1x12 = vacc0x12;
+    int32x4_t vacc1x13 = vacc0x13;
+    int32x4_t vacc1x14 = vacc0x14;
+    int32x4_t vacc1x15 = vacc0x15;
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      a += 2;
+
+      // KC loop: processes 16 bytes of K per iteration; kc is rounded up to a multiple of 16 above, so the final iteration may cover up to 15 remainder bytes.
+      size_t k = 0;
+      while (k < kc) {
+        const int8x16_t va0 = vld1q_s8(a0); a0 += 16;
+        const int8x16_t va1 = vld1q_s8(a1); a1 += 16;
+
+        const int8x16_t vb0 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb1 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb2 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb3 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb4 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb5 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb6 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb7 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb8 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb9 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb10 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb11 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb12 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb13 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb14 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb15 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+
+        int16x8_t vprod0x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va0));
+        int16x8_t vprod1x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va1));
+        vprod0x0 = vmlal_s8(vprod0x0, vget_high_s8(vb0), vget_high_s8(va0));
+        vprod1x0 = vmlal_s8(vprod1x0, vget_high_s8(vb0), vget_high_s8(va1));
+        vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+        vacc1x0 = vpadalq_s16(vacc1x0, vprod1x0);
+        int16x8_t vprod0x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va0));
+        int16x8_t vprod1x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va1));
+        vprod0x1 = vmlal_s8(vprod0x1, vget_high_s8(vb1), vget_high_s8(va0));
+        vprod1x1 = vmlal_s8(vprod1x1, vget_high_s8(vb1), vget_high_s8(va1));
+        vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+        vacc1x1 = vpadalq_s16(vacc1x1, vprod1x1);
+        int16x8_t vprod0x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va0));
+        int16x8_t vprod1x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va1));
+        vprod0x2 = vmlal_s8(vprod0x2, vget_high_s8(vb2), vget_high_s8(va0));
+        vprod1x2 = vmlal_s8(vprod1x2, vget_high_s8(vb2), vget_high_s8(va1));
+        vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+        vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);
+        int16x8_t vprod0x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va0));
+        int16x8_t vprod1x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va1));
+        vprod0x3 = vmlal_s8(vprod0x3, vget_high_s8(vb3), vget_high_s8(va0));
+        vprod1x3 = vmlal_s8(vprod1x3, vget_high_s8(vb3), vget_high_s8(va1));
+        vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+        vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);
+        int16x8_t vprod0x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va0));
+        int16x8_t vprod1x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va1));
+        vprod0x4 = vmlal_s8(vprod0x4, vget_high_s8(vb4), vget_high_s8(va0));
+        vprod1x4 = vmlal_s8(vprod1x4, vget_high_s8(vb4), vget_high_s8(va1));
+        vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+        vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);
+        int16x8_t vprod0x5 = vmull_s8(vget_low_s8(vb5), vget_low_s8(va0));
+        int16x8_t vprod1x5 = vmull_s8(vget_low_s8(vb5), vget_low_s8(va1));
+        vprod0x5 = vmlal_s8(vprod0x5, vget_high_s8(vb5), vget_high_s8(va0));
+        vprod1x5 = vmlal_s8(vprod1x5, vget_high_s8(vb5), vget_high_s8(va1));
+        vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+        vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);
+        int16x8_t vprod0x6 = vmull_s8(vget_low_s8(vb6), vget_low_s8(va0));
+        int16x8_t vprod1x6 = vmull_s8(vget_low_s8(vb6), vget_low_s8(va1));
+        vprod0x6 = vmlal_s8(vprod0x6, vget_high_s8(vb6), vget_high_s8(va0));
+        vprod1x6 = vmlal_s8(vprod1x6, vget_high_s8(vb6), vget_high_s8(va1));
+        vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+        vacc1x6 = vpadalq_s16(vacc1x6, vprod1x6);
+        int16x8_t vprod0x7 = vmull_s8(vget_low_s8(vb7), vget_low_s8(va0));
+        int16x8_t vprod1x7 = vmull_s8(vget_low_s8(vb7), vget_low_s8(va1));
+        vprod0x7 = vmlal_s8(vprod0x7, vget_high_s8(vb7), vget_high_s8(va0));
+        vprod1x7 = vmlal_s8(vprod1x7, vget_high_s8(vb7), vget_high_s8(va1));
+        vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+        vacc1x7 = vpadalq_s16(vacc1x7, vprod1x7);
+        int16x8_t vprod0x8 = vmull_s8(vget_low_s8(vb8), vget_low_s8(va0));
+        int16x8_t vprod1x8 = vmull_s8(vget_low_s8(vb8), vget_low_s8(va1));
+        vprod0x8 = vmlal_s8(vprod0x8, vget_high_s8(vb8), vget_high_s8(va0));
+        vprod1x8 = vmlal_s8(vprod1x8, vget_high_s8(vb8), vget_high_s8(va1));
+        vacc0x8 = vpadalq_s16(vacc0x8, vprod0x8);
+        vacc1x8 = vpadalq_s16(vacc1x8, vprod1x8);
+        int16x8_t vprod0x9 = vmull_s8(vget_low_s8(vb9), vget_low_s8(va0));
+        int16x8_t vprod1x9 = vmull_s8(vget_low_s8(vb9), vget_low_s8(va1));
+        vprod0x9 = vmlal_s8(vprod0x9, vget_high_s8(vb9), vget_high_s8(va0));
+        vprod1x9 = vmlal_s8(vprod1x9, vget_high_s8(vb9), vget_high_s8(va1));
+        vacc0x9 = vpadalq_s16(vacc0x9, vprod0x9);
+        vacc1x9 = vpadalq_s16(vacc1x9, vprod1x9);
+        int16x8_t vprod0x10 = vmull_s8(vget_low_s8(vb10), vget_low_s8(va0));
+        int16x8_t vprod1x10 = vmull_s8(vget_low_s8(vb10), vget_low_s8(va1));
+        vprod0x10 = vmlal_s8(vprod0x10, vget_high_s8(vb10), vget_high_s8(va0));
+        vprod1x10 = vmlal_s8(vprod1x10, vget_high_s8(vb10), vget_high_s8(va1));
+        vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
+        vacc1x10 = vpadalq_s16(vacc1x10, vprod1x10);
+        int16x8_t vprod0x11 = vmull_s8(vget_low_s8(vb11), vget_low_s8(va0));
+        int16x8_t vprod1x11 = vmull_s8(vget_low_s8(vb11), vget_low_s8(va1));
+        vprod0x11 = vmlal_s8(vprod0x11, vget_high_s8(vb11), vget_high_s8(va0));
+        vprod1x11 = vmlal_s8(vprod1x11, vget_high_s8(vb11), vget_high_s8(va1));
+        vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11);
+        vacc1x11 = vpadalq_s16(vacc1x11, vprod1x11);
+        int16x8_t vprod0x12 = vmull_s8(vget_low_s8(vb12), vget_low_s8(va0));
+        int16x8_t vprod1x12 = vmull_s8(vget_low_s8(vb12), vget_low_s8(va1));
+        vprod0x12 = vmlal_s8(vprod0x12, vget_high_s8(vb12), vget_high_s8(va0));
+        vprod1x12 = vmlal_s8(vprod1x12, vget_high_s8(vb12), vget_high_s8(va1));
+        vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);
+        vacc1x12 = vpadalq_s16(vacc1x12, vprod1x12);
+        int16x8_t vprod0x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va0));
+        int16x8_t vprod1x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va1));
+        vprod0x13 = vmlal_s8(vprod0x13, vget_high_s8(vb13), vget_high_s8(va0));
+        vprod1x13 = vmlal_s8(vprod1x13, vget_high_s8(vb13), vget_high_s8(va1));
+        vacc0x13 = vpadalq_s16(vacc0x13, vprod0x13);
+        vacc1x13 = vpadalq_s16(vacc1x13, vprod1x13);
+        int16x8_t vprod0x14 = vmull_s8(vget_low_s8(vb14), vget_low_s8(va0));
+        int16x8_t vprod1x14 = vmull_s8(vget_low_s8(vb14), vget_low_s8(va1));
+        vprod0x14 = vmlal_s8(vprod0x14, vget_high_s8(vb14), vget_high_s8(va0));
+        vprod1x14 = vmlal_s8(vprod1x14, vget_high_s8(vb14), vget_high_s8(va1));
+        vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
+        vacc1x14 = vpadalq_s16(vacc1x14, vprod1x14);
+        int16x8_t vprod0x15 = vmull_s8(vget_low_s8(vb15), vget_low_s8(va0));
+        int16x8_t vprod1x15 = vmull_s8(vget_low_s8(vb15), vget_low_s8(va1));
+        vprod0x15 = vmlal_s8(vprod0x15, vget_high_s8(vb15), vget_high_s8(va0));
+        vprod1x15 = vmlal_s8(vprod1x15, vget_high_s8(vb15), vget_high_s8(va1));
+        vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
+        vacc1x15 = vpadalq_s16(vacc1x15, vprod1x15);
+
+        k += 16 * sizeof(int8_t);
+      }
+
+      p -= 2 * sizeof(void*);
+    } while (p != 0);
+
+#if XNN_ARCH_ARM64
+    const int32x4_t vsum0x01 = vpaddq_s32(vacc0x0, vacc0x1);
+    const int32x4_t vsum0x23 = vpaddq_s32(vacc0x2, vacc0x3);
+    const int32x4_t vsum0x45 = vpaddq_s32(vacc0x4, vacc0x5);
+    const int32x4_t vsum0x67 = vpaddq_s32(vacc0x6, vacc0x7);
+    const int32x4_t vsum0x89 = vpaddq_s32(vacc0x8, vacc0x9);
+    const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11);
+    const int32x4_t vsum0xCD = vpaddq_s32(vacc0x12, vacc0x13);
+    const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
+    const int32x4_t vsum1x01 = vpaddq_s32(vacc1x0, vacc1x1);
+    const int32x4_t vsum1x23 = vpaddq_s32(vacc1x2, vacc1x3);
+    const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);
+    const int32x4_t vsum1x67 = vpaddq_s32(vacc1x6, vacc1x7);
+    const int32x4_t vsum1x89 = vpaddq_s32(vacc1x8, vacc1x9);
+    const int32x4_t vsum1xAB = vpaddq_s32(vacc1x10, vacc1x11);
+    const int32x4_t vsum1xCD = vpaddq_s32(vacc1x12, vacc1x13);
+    const int32x4_t vsum1xEF = vpaddq_s32(vacc1x14, vacc1x15);
+    int32x4_t vacc0x0123 = vpaddq_s32(vsum0x01, vsum0x23);
+    int32x4_t vacc0x4567 = vpaddq_s32(vsum0x45, vsum0x67);
+    int32x4_t vacc0x89AB = vpaddq_s32(vsum0x89, vsum0xAB);
+    int32x4_t vacc0xCDEF = vpaddq_s32(vsum0xCD, vsum0xEF);
+    int32x4_t vacc1x0123 = vpaddq_s32(vsum1x01, vsum1x23);
+    int32x4_t vacc1x4567 = vpaddq_s32(vsum1x45, vsum1x67);
+    int32x4_t vacc1x89AB = vpaddq_s32(vsum1x89, vsum1xAB);
+    int32x4_t vacc1xCDEF = vpaddq_s32(vsum1xCD, vsum1xEF);
+#else
+    const int32x2_t vpsum0x0 = vadd_s32(vget_low_s32(vacc0x0), vget_high_s32(vacc0x0));
+    const int32x2_t vpsum0x1 = vadd_s32(vget_low_s32(vacc0x1), vget_high_s32(vacc0x1));
+    const int32x2_t vpsum0x2 = vadd_s32(vget_low_s32(vacc0x2), vget_high_s32(vacc0x2));
+    const int32x2_t vpsum0x3 = vadd_s32(vget_low_s32(vacc0x3), vget_high_s32(vacc0x3));
+    const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1);
+    const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3);
+    int32x4_t vacc0x0123 = vcombine_s32(vsum0x01, vsum0x23);
+    const int32x2_t vpsum0x4 = vadd_s32(vget_low_s32(vacc0x4), vget_high_s32(vacc0x4));
+    const int32x2_t vpsum0x5 = vadd_s32(vget_low_s32(vacc0x5), vget_high_s32(vacc0x5));
+    const int32x2_t vpsum0x6 = vadd_s32(vget_low_s32(vacc0x6), vget_high_s32(vacc0x6));
+    const int32x2_t vpsum0x7 = vadd_s32(vget_low_s32(vacc0x7), vget_high_s32(vacc0x7));
+    const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5);
+    const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7);
+    int32x4_t vacc0x4567 = vcombine_s32(vsum0x45, vsum0x67);
+    const int32x2_t vpsum0x8 = vadd_s32(vget_low_s32(vacc0x8), vget_high_s32(vacc0x8));
+    const int32x2_t vpsum0x9 = vadd_s32(vget_low_s32(vacc0x9), vget_high_s32(vacc0x9));
+    const int32x2_t vpsum0xA = vadd_s32(vget_low_s32(vacc0x10), vget_high_s32(vacc0x10));
+    const int32x2_t vpsum0xB = vadd_s32(vget_low_s32(vacc0x11), vget_high_s32(vacc0x11));
+    const int32x2_t vsum0x89 = vpadd_s32(vpsum0x8, vpsum0x9);
+    const int32x2_t vsum0xAB = vpadd_s32(vpsum0xA, vpsum0xB);
+    int32x4_t vacc0x89AB = vcombine_s32(vsum0x89, vsum0xAB);
+    const int32x2_t vpsum0xC = vadd_s32(vget_low_s32(vacc0x12), vget_high_s32(vacc0x12));
+    const int32x2_t vpsum0xD = vadd_s32(vget_low_s32(vacc0x13), vget_high_s32(vacc0x13));
+    const int32x2_t vpsum0xE = vadd_s32(vget_low_s32(vacc0x14), vget_high_s32(vacc0x14));
+    const int32x2_t vpsum0xF = vadd_s32(vget_low_s32(vacc0x15), vget_high_s32(vacc0x15));
+    const int32x2_t vsum0xCD = vpadd_s32(vpsum0xC, vpsum0xD);
+    const int32x2_t vsum0xEF = vpadd_s32(vpsum0xE, vpsum0xF);
+    int32x4_t vacc0xCDEF = vcombine_s32(vsum0xCD, vsum0xEF);
+    const int32x2_t vpsum1x0 = vadd_s32(vget_low_s32(vacc1x0), vget_high_s32(vacc1x0));
+    const int32x2_t vpsum1x1 = vadd_s32(vget_low_s32(vacc1x1), vget_high_s32(vacc1x1));
+    const int32x2_t vpsum1x2 = vadd_s32(vget_low_s32(vacc1x2), vget_high_s32(vacc1x2));
+    const int32x2_t vpsum1x3 = vadd_s32(vget_low_s32(vacc1x3), vget_high_s32(vacc1x3));
+    const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1);
+    const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3);
+    int32x4_t vacc1x0123 = vcombine_s32(vsum1x01, vsum1x23);
+    const int32x2_t vpsum1x4 = vadd_s32(vget_low_s32(vacc1x4), vget_high_s32(vacc1x4));
+    const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));
+    const int32x2_t vpsum1x6 = vadd_s32(vget_low_s32(vacc1x6), vget_high_s32(vacc1x6));
+    const int32x2_t vpsum1x7 = vadd_s32(vget_low_s32(vacc1x7), vget_high_s32(vacc1x7));
+    const int32x2_t vsum1x45 = vpadd_s32(vpsum1x4, vpsum1x5);
+    const int32x2_t vsum1x67 = vpadd_s32(vpsum1x6, vpsum1x7);
+    int32x4_t vacc1x4567 = vcombine_s32(vsum1x45, vsum1x67);
+    const int32x2_t vpsum1x8 = vadd_s32(vget_low_s32(vacc1x8), vget_high_s32(vacc1x8));
+    const int32x2_t vpsum1x9 = vadd_s32(vget_low_s32(vacc1x9), vget_high_s32(vacc1x9));
+    const int32x2_t vpsum1xA = vadd_s32(vget_low_s32(vacc1x10), vget_high_s32(vacc1x10));
+    const int32x2_t vpsum1xB = vadd_s32(vget_low_s32(vacc1x11), vget_high_s32(vacc1x11));
+    const int32x2_t vsum1x89 = vpadd_s32(vpsum1x8, vpsum1x9);
+    const int32x2_t vsum1xAB = vpadd_s32(vpsum1xA, vpsum1xB);
+    int32x4_t vacc1x89AB = vcombine_s32(vsum1x89, vsum1xAB);
+    const int32x2_t vpsum1xC = vadd_s32(vget_low_s32(vacc1x12), vget_high_s32(vacc1x12));
+    const int32x2_t vpsum1xD = vadd_s32(vget_low_s32(vacc1x13), vget_high_s32(vacc1x13));
+    const int32x2_t vpsum1xE = vadd_s32(vget_low_s32(vacc1x14), vget_high_s32(vacc1x14));
+    const int32x2_t vpsum1xF = vadd_s32(vget_low_s32(vacc1x15), vget_high_s32(vacc1x15));
+    const int32x2_t vsum1xCD = vpadd_s32(vpsum1xC, vpsum1xD);
+    const int32x2_t vsum1xEF = vpadd_s32(vpsum1xE, vpsum1xF);
+    int32x4_t vacc1xCDEF = vcombine_s32(vsum1xCD, vsum1xEF);
+#endif
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+    vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc1x89AB = vqrdmulhq_s32(vacc1x89AB, vmultiplier);
+    vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+    vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc1x89AB = vsraq_n_s32(vacc1x89AB, vbicq_s32(vacc1x89AB, vzero_shift_mask), 31);
+    vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc1x89AB = vrshlq_s32(vacc1x89AB, vright_shift);
+    vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
+    int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+    int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+    int8x16_t vout1x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc1x01234567), vqmovn_s16(vacc1x89ABCDEF));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout1x0123456789ABCDEF = vmaxq_s8(vout1x0123456789ABCDEF, voutput_min);
+    vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+
+    vout1x0123456789ABCDEF = vminq_s8(vout1x0123456789ABCDEF, voutput_max);
+    vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+
+    if (nc >= 16) {
+      vst1q_s8(c1 + 0, vout1x0123456789ABCDEF);
+      vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 16;
+    } else {
+      int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8(vout1x0123456789ABCDEF));
+      if (nc & 8) {
+        vst1_s8(c1, vget_high_s8(vout0x01234567_1x01234567)); c1 += 8;
+        vst1_s8(c0, vget_low_s8(vout0x01234567_1x01234567)); c0 += 8;
+        vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0123456789ABCDEF));
+      }
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/2x16c2-minmax-neon-mlal-padal-dup.c b/src/qs8-igemm/gen/2x16c2-minmax-neon-mlal-padal-dup.c
new file mode 100644
index 0000000..d35ce3f
--- /dev/null
+++ b/src/qs8-igemm/gen/2x16c2-minmax-neon-mlal-padal-dup.c
@@ -0,0 +1,471 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/c2-neon-mull-padal-dup.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (2 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    c1 = c0;
+  }
+
+  do {
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc1x0123 = vacc0x0123;
+    int32x4_t vacc1x4567 = vacc0x4567;
+    int32x4_t vacc1x89AB = vacc0x89AB;
+    int32x4_t vacc1xCDEF = vacc0xCDEF;
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      a += 2;
+
+      size_t k = kc;
+
+      while (k >= 16 * sizeof(int8_t)) {
+        const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va0x1 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va1x0 = vld1_s8(a1); a1 += 8;
+        const int8x8_t va1x1 = vld1_s8(a1); a1 += 8;
+
+        const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+        int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0)));
+        const int8x8_t vb0123c0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x0123c0 = vmlal_s8(vprod0x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+        vprod1x0123c0 = vmlal_s8(vprod1x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+        int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+        int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0)));
+        const int8x8_t vb4567c0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x4567c0 = vmlal_s8(vprod0x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+        vprod1x4567c0 = vmlal_s8(vprod1x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+        int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+        int16x8_t vprod1x89ABc0 = vmull_s8(vb89ABc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0)));
+        const int8x8_t vb89ABc0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x89ABc0 = vmlal_s8(vprod0x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+        vprod1x89ABc0 = vmlal_s8(vprod1x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+        vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc0);
+        int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+        int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0)));
+        const int8x8_t vbCDEFc0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0xCDEFc0 = vmlal_s8(vprod0xCDEFc0, vbCDEFc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+        vprod1xCDEFc0 = vmlal_s8(vprod1xCDEFc0, vbCDEFc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+        vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0);
+        int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+        int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 1)));
+        const int8x8_t vb0123c1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x0123c1 = vmlal_s8(vprod0x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+        vprod1x0123c1 = vmlal_s8(vprod1x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+        int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+        int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 1)));
+        const int8x8_t vb4567c1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x4567c1 = vmlal_s8(vprod0x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+        vprod1x4567c1 = vmlal_s8(vprod1x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+        int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+        int16x8_t vprod1x89ABc1 = vmull_s8(vb89ABc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 1)));
+        const int8x8_t vb89ABc1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x89ABc1 = vmlal_s8(vprod0x89ABc1, vb89ABc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+        vprod1x89ABc1 = vmlal_s8(vprod1x89ABc1, vb89ABc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+        vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc1);
+        int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+        int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 1)));
+        const int8x8_t vbCDEFc1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0xCDEFc1 = vmlal_s8(vprod0xCDEFc1, vbCDEFc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+        vprod1xCDEFc1 = vmlal_s8(vprod1xCDEFc1, vbCDEFc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+        vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1);
+        int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+        int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 2)));
+        const int8x8_t vb0123c2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x0123c2 = vmlal_s8(vprod0x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+        vprod1x0123c2 = vmlal_s8(vprod1x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+        int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+        int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 2)));
+        const int8x8_t vb4567c2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x4567c2 = vmlal_s8(vprod0x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+        vprod1x4567c2 = vmlal_s8(vprod1x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+        int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+        int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 2)));
+        const int8x8_t vb89ABc2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x89ABc2 = vmlal_s8(vprod0x89ABc2, vb89ABc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+        vprod1x89ABc2 = vmlal_s8(vprod1x89ABc2, vb89ABc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+        vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
+        int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+        int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 2)));
+        const int8x8_t vbCDEFc2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0xCDEFc2 = vmlal_s8(vprod0xCDEFc2, vbCDEFc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+        vprod1xCDEFc2 = vmlal_s8(vprod1xCDEFc2, vbCDEFc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+        vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
+        int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+        int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 3)));
+        const int8x8_t vb0123c3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x0123c3 = vmlal_s8(vprod0x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+        vprod1x0123c3 = vmlal_s8(vprod1x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+        int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+        int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 3)));
+        const int8x8_t vb4567c3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x4567c3 = vmlal_s8(vprod0x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+        vprod1x4567c3 = vmlal_s8(vprod1x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+        int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+        int16x8_t vprod1x89ABc3 = vmull_s8(vb89ABc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 3)));
+        const int8x8_t vb89ABc3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x89ABc3 = vmlal_s8(vprod0x89ABc3, vb89ABc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+        vprod1x89ABc3 = vmlal_s8(vprod1x89ABc3, vb89ABc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
+        vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc3);
+        int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+        int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 3)));
+        const int8x8_t vbCDEFc3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0xCDEFc3 = vmlal_s8(vprod0xCDEFc3, vbCDEFc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+        vprod1xCDEFc3 = vmlal_s8(vprod1xCDEFc3, vbCDEFc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
+        vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3);
+
+        k -= 16 * sizeof(int8_t);
+      }
+
+      if (k >= 8 * sizeof(int8_t)) {
+        const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+
+        const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+        const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+        const int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
+        const int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
+        const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+        const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+        const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+        const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+        const int16x8_t vprod1x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        const int16x8_t vprod1x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+        const int16x8_t vprod1x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+        vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc0);
+        vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc1);
+        vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
+        vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc3);
+        const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+        const int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+        vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0);
+        vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1);
+        vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
+        vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3);
+
+        k -= 8 * sizeof(int8_t);
+      }
+
+      if XNN_UNLIKELY(k != 0) {
+        const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+        const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
+
+        const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+        const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+        const int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+        const int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+        const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+        const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+        const int16x8_t vprod1x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc0);
+        const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0);
+
+        if (k > 2 * sizeof(int8_t)) {
+          const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb89ABc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vbCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+          const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+          const int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+          const int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+          const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+          vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+          const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+          vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+          const int16x8_t vprod1x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+          vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc1);
+          const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+          vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1);
+
+          if (k > 4 * sizeof(int8_t)) {
+            const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int8x8_t vb89ABc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int8x8_t vbCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+            const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+            const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+            const int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+            const int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+            const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+            vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+            const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+            vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+            const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+            vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
+            const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+            vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
+          }
+        }
+      }
+      p -= 2 * sizeof(void*);
+    } while (p != 0);
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+    vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc1x89AB = vqrdmulhq_s32(vacc1x89AB, vmultiplier);
+    vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+    vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc1x89AB = vsraq_n_s32(vacc1x89AB, vbicq_s32(vacc1x89AB, vzero_shift_mask), 31);
+    vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc1x89AB = vrshlq_s32(vacc1x89AB, vright_shift);
+    vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+    int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+    int8x16_t vout1x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc1x01234567), vqmovn_s16(vacc1x89ABCDEF));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout1x0123456789ABCDEF = vmaxq_s8(vout1x0123456789ABCDEF, voutput_min);
+    vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+
+    vout1x0123456789ABCDEF = vminq_s8(vout1x0123456789ABCDEF, voutput_max);
+    vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+
+    if (nc >= 16) {
+      vst1q_s8(c1 + 0, vout1x0123456789ABCDEF);
+      vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 16;
+    } else {
+      int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8(vout1x0123456789ABCDEF));
+      if (nc & 8) {
+        vst1_s8(c1, vget_high_s8(vout0x01234567_1x01234567)); c1 += 8;
+        vst1_s8(c0, vget_low_s8(vout0x01234567_1x01234567)); c0 += 8;
+        vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0123456789ABCDEF));
+      }
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/2x16c2-minmax-neon-mull-padal-dup.c b/src/qs8-igemm/gen/2x16c2-minmax-neon-mull-padal-dup.c
new file mode 100644
index 0000000..b13ce66
--- /dev/null
+++ b/src/qs8-igemm/gen/2x16c2-minmax-neon-mull-padal-dup.c
@@ -0,0 +1,333 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/c2-neon-mull-padal-dup.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (2 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    c1 = c0;
+  }
+
+  do {
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc1x0123 = vacc0x0123;
+    int32x4_t vacc1x4567 = vacc0x4567;
+    int32x4_t vacc1x89AB = vacc0x89AB;
+    int32x4_t vacc1xCDEF = vacc0xCDEF;
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      a += 2;
+
+      size_t k = kc;
+
+
+      while (k >= 8 * sizeof(int8_t)) {
+        const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+
+        const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+        const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+        const int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
+        const int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
+        const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+        const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+        const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+        const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+        const int16x8_t vprod1x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        const int16x8_t vprod1x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+        const int16x8_t vprod1x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+        vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc0);
+        vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc1);
+        vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
+        vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc3);
+        const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+        const int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+        vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0);
+        vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1);
+        vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
+        vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3);
+
+        k -= 8 * sizeof(int8_t);
+      }
+
+      if XNN_UNLIKELY(k != 0) {
+        const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+        const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
+
+        const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+        const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+        const int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+        const int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+        const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+        const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+        const int16x8_t vprod1x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc0);
+        const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0);
+
+        if (k > 2 * sizeof(int8_t)) {
+          const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb89ABc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vbCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+          const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+          const int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+          const int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+          const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+          vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+          const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+          vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+          const int16x8_t vprod1x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+          vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc1);
+          const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+          vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1);
+
+          if (k > 4 * sizeof(int8_t)) {
+            const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int8x8_t vb89ABc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int8x8_t vbCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+            const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+            const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+            const int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+            const int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+            const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+            vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+            const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+            vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+            const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+            vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
+            const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+            vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
+          }
+        }
+      }
+      p -= 2 * sizeof(void*);
+    } while (p != 0);
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+    vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc1x89AB = vqrdmulhq_s32(vacc1x89AB, vmultiplier);
+    vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+    vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc1x89AB = vsraq_n_s32(vacc1x89AB, vbicq_s32(vacc1x89AB, vzero_shift_mask), 31);
+    vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc1x89AB = vrshlq_s32(vacc1x89AB, vright_shift);
+    vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+    int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+    int8x16_t vout1x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc1x01234567), vqmovn_s16(vacc1x89ABCDEF));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout1x0123456789ABCDEF = vmaxq_s8(vout1x0123456789ABCDEF, voutput_min);
+    vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+
+    vout1x0123456789ABCDEF = vminq_s8(vout1x0123456789ABCDEF, voutput_max);
+    vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+
+    if (nc >= 16) {
+      vst1q_s8(c1 + 0, vout1x0123456789ABCDEF);
+      vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 16;
+    } else {
+      int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8(vout1x0123456789ABCDEF));
+      if (nc & 8) {
+        vst1_s8(c1, vget_high_s8(vout0x01234567_1x01234567)); c1 += 8;
+        vst1_s8(c0, vget_low_s8(vout0x01234567_1x01234567)); c0 += 8;
+        vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0123456789ABCDEF));
+      }
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
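For reference, the kernel above ends with the requantization tail shared by these NEON QS8 kernels: `vqrdmulhq_s32` scales each 32-bit accumulator by a Q31 multiplier, the `vsraq_n_s32`/`vbicq_s32` pair applies a sign correction ahead of the rounding right shift `vrshlq_s32`, and the result is narrowed, offset by `output_zero_point`, and clamped to `output_min`/`output_max`. A rough per-element scalar sketch is below; it glosses over the saturation in `vqrdmulhq_s32` and the exact tie-breaking of the sign-correction step, and the function and parameter names are illustrative only (not XNNPACK API).

#include <stdint.h>

// Illustrative scalar sketch of the NEON requantization tail above.
// 'multiplier' is the Q31 fixed-point scale; 'shift' is the non-negative
// right-shift amount (the kernel passes its negation to vrshlq_s32, which
// therefore shifts right).
static inline int8_t requantize_sketch(int32_t acc, int32_t multiplier, uint32_t shift,
                                       int16_t zero_point, int8_t out_min, int8_t out_max) {
  // vqrdmulhq_s32: rounded high half of the doubled product (saturation ignored here).
  const int64_t product = (int64_t) acc * (int64_t) multiplier;
  const int32_t q31 = (int32_t) ((product + (INT64_C(1) << 30)) >> 31);
  // vrshlq_s32 by a negative amount: rounding arithmetic shift right.
  const int32_t rounding = (shift == 0) ? 0 : (INT32_C(1) << (shift - 1));
  const int32_t scaled = (int32_t) (((int64_t) q31 + rounding) >> shift);
  // Add the output zero point and clamp (the vqmovn/vmaxq_s8/vminq_s8 steps).
  int32_t out = scaled + (int32_t) zero_point;
  if (out < (int32_t) out_min) out = (int32_t) out_min;
  if (out > (int32_t) out_max) out = (int32_t) out_max;
  return (int8_t) out;
}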
diff --git a/src/qs8-igemm/gen/2x16c8-minmax-avx512skx.c b/src/qs8-igemm/gen/2x16c8-minmax-avx512skx.c
index 347ed78..328bc55 100644
--- a/src/qs8-igemm/gen/2x16c8-minmax-avx512skx.c
+++ b/src/qs8-igemm/gen/2x16c8-minmax-avx512skx.c
@@ -13,6 +13,7 @@
 
 #include <xnnpack/igemm.h>
 #include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx(
@@ -38,6 +39,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr != 2) {
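The recurring `kc = round_up_po2(kc, 8);` additions in this change round the reduction dimension up to a multiple of 8 before the loop bounds are computed, so the c8 kernels always consume whole 8-byte groups, matching the zero-padded layout produced by the weight-packing code; `round_up_po2` comes from the newly included `<xnnpack/math.h>`. A minimal sketch of such a power-of-two round-up (assuming `q` is a power of two; the name is illustrative):

#include <stddef.h>

// Round n up to the next multiple of q; q is assumed to be a power of two.
static inline size_t round_up_po2_sketch(size_t n, size_t q) {
  return (n + q - 1) & ~(q - 1);
}
// Example: round_up_po2_sketch(13, 8) == 16; round_up_po2_sketch(16, 8) == 16.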
diff --git a/src/qs8-igemm/gen/2x16c8-minmax-neon-mlal-padal.c b/src/qs8-igemm/gen/2x16c8-minmax-neon-mlal-padal.c
new file mode 100644
index 0000000..6263b15
--- /dev/null
+++ b/src/qs8-igemm/gen/2x16c8-minmax-neon-mlal-padal.c
@@ -0,0 +1,504 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/c8-neon-mull-padal.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (2 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    c1 = c0;
+  }
+
+  do {
+    int32x4_t vacc0x0 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x1 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x2 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x3 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x4 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x5 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x6 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x7 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x8 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x9 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x10 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x11 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x12 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x13 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x14 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x15 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc1x0 = vacc0x0;
+    int32x4_t vacc1x1 = vacc0x1;
+    int32x4_t vacc1x2 = vacc0x2;
+    int32x4_t vacc1x3 = vacc0x3;
+    int32x4_t vacc1x4 = vacc0x4;
+    int32x4_t vacc1x5 = vacc0x5;
+    int32x4_t vacc1x6 = vacc0x6;
+    int32x4_t vacc1x7 = vacc0x7;
+    int32x4_t vacc1x8 = vacc0x8;
+    int32x4_t vacc1x9 = vacc0x9;
+    int32x4_t vacc1x10 = vacc0x10;
+    int32x4_t vacc1x11 = vacc0x11;
+    int32x4_t vacc1x12 = vacc0x12;
+    int32x4_t vacc1x13 = vacc0x13;
+    int32x4_t vacc1x14 = vacc0x14;
+    int32x4_t vacc1x15 = vacc0x15;
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      a += 2;
+
+      size_t k = kc;
+      // 2x partial unrolled loop to load 16 bytes at a time using MLA.
+      while (k >= 16 * sizeof(int8_t)) {
+        const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va0x1 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va1x0 = vld1_s8(a1); a1 += 8;
+        const int8x8_t va1x1 = vld1_s8(a1); a1 += 8;
+
+        const int8x8_t vb0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb4x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb5x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb6x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb7x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb8x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb9x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb10x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb11x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb12x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb13x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb14x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb15x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+
+        const int8x8_t vb0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x0 = vmull_s8(vb0x0, va0x0);
+        int16x8_t vprod1x0 = vmull_s8(vb0x0, va1x0);
+        vprod0x0 = vmlal_s8(vprod0x0, vb0x1, va0x1);
+        vprod1x0 = vmlal_s8(vprod1x0, vb0x1, va1x1);
+        vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+        vacc1x0 = vpadalq_s16(vacc1x0, vprod1x0);
+        const int8x8_t vb1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x1 = vmull_s8(vb1x0, va0x0);
+        int16x8_t vprod1x1 = vmull_s8(vb1x0, va1x0);
+        vprod0x1 = vmlal_s8(vprod0x1, vb1x1, va0x1);
+        vprod1x1 = vmlal_s8(vprod1x1, vb1x1, va1x1);
+        vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+        vacc1x1 = vpadalq_s16(vacc1x1, vprod1x1);
+        const int8x8_t vb2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x2 = vmull_s8(vb2x0, va0x0);
+        int16x8_t vprod1x2 = vmull_s8(vb2x0, va1x0);
+        vprod0x2 = vmlal_s8(vprod0x2, vb2x1, va0x1);
+        vprod1x2 = vmlal_s8(vprod1x2, vb2x1, va1x1);
+        vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+        vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);
+        const int8x8_t vb3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x3 = vmull_s8(vb3x0, va0x0);
+        int16x8_t vprod1x3 = vmull_s8(vb3x0, va1x0);
+        vprod0x3 = vmlal_s8(vprod0x3, vb3x1, va0x1);
+        vprod1x3 = vmlal_s8(vprod1x3, vb3x1, va1x1);
+        vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+        vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);
+        const int8x8_t vb4x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x4 = vmull_s8(vb4x0, va0x0);
+        int16x8_t vprod1x4 = vmull_s8(vb4x0, va1x0);
+        vprod0x4 = vmlal_s8(vprod0x4, vb4x1, va0x1);
+        vprod1x4 = vmlal_s8(vprod1x4, vb4x1, va1x1);
+        vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+        vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);
+        const int8x8_t vb5x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x5 = vmull_s8(vb5x0, va0x0);
+        int16x8_t vprod1x5 = vmull_s8(vb5x0, va1x0);
+        vprod0x5 = vmlal_s8(vprod0x5, vb5x1, va0x1);
+        vprod1x5 = vmlal_s8(vprod1x5, vb5x1, va1x1);
+        vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+        vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);
+        const int8x8_t vb6x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x6 = vmull_s8(vb6x0, va0x0);
+        int16x8_t vprod1x6 = vmull_s8(vb6x0, va1x0);
+        vprod0x6 = vmlal_s8(vprod0x6, vb6x1, va0x1);
+        vprod1x6 = vmlal_s8(vprod1x6, vb6x1, va1x1);
+        vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+        vacc1x6 = vpadalq_s16(vacc1x6, vprod1x6);
+        const int8x8_t vb7x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x7 = vmull_s8(vb7x0, va0x0);
+        int16x8_t vprod1x7 = vmull_s8(vb7x0, va1x0);
+        vprod0x7 = vmlal_s8(vprod0x7, vb7x1, va0x1);
+        vprod1x7 = vmlal_s8(vprod1x7, vb7x1, va1x1);
+        vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+        vacc1x7 = vpadalq_s16(vacc1x7, vprod1x7);
+        const int8x8_t vb8x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x8 = vmull_s8(vb8x0, va0x0);
+        int16x8_t vprod1x8 = vmull_s8(vb8x0, va1x0);
+        vprod0x8 = vmlal_s8(vprod0x8, vb8x1, va0x1);
+        vprod1x8 = vmlal_s8(vprod1x8, vb8x1, va1x1);
+        vacc0x8 = vpadalq_s16(vacc0x8, vprod0x8);
+        vacc1x8 = vpadalq_s16(vacc1x8, vprod1x8);
+        const int8x8_t vb9x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x9 = vmull_s8(vb9x0, va0x0);
+        int16x8_t vprod1x9 = vmull_s8(vb9x0, va1x0);
+        vprod0x9 = vmlal_s8(vprod0x9, vb9x1, va0x1);
+        vprod1x9 = vmlal_s8(vprod1x9, vb9x1, va1x1);
+        vacc0x9 = vpadalq_s16(vacc0x9, vprod0x9);
+        vacc1x9 = vpadalq_s16(vacc1x9, vprod1x9);
+        const int8x8_t vb10x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x10 = vmull_s8(vb10x0, va0x0);
+        int16x8_t vprod1x10 = vmull_s8(vb10x0, va1x0);
+        vprod0x10 = vmlal_s8(vprod0x10, vb10x1, va0x1);
+        vprod1x10 = vmlal_s8(vprod1x10, vb10x1, va1x1);
+        vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
+        vacc1x10 = vpadalq_s16(vacc1x10, vprod1x10);
+        const int8x8_t vb11x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x11 = vmull_s8(vb11x0, va0x0);
+        int16x8_t vprod1x11 = vmull_s8(vb11x0, va1x0);
+        vprod0x11 = vmlal_s8(vprod0x11, vb11x1, va0x1);
+        vprod1x11 = vmlal_s8(vprod1x11, vb11x1, va1x1);
+        vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11);
+        vacc1x11 = vpadalq_s16(vacc1x11, vprod1x11);
+        const int8x8_t vb12x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x12 = vmull_s8(vb12x0, va0x0);
+        int16x8_t vprod1x12 = vmull_s8(vb12x0, va1x0);
+        vprod0x12 = vmlal_s8(vprod0x12, vb12x1, va0x1);
+        vprod1x12 = vmlal_s8(vprod1x12, vb12x1, va1x1);
+        vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);
+        vacc1x12 = vpadalq_s16(vacc1x12, vprod1x12);
+        const int8x8_t vb13x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x13 = vmull_s8(vb13x0, va0x0);
+        int16x8_t vprod1x13 = vmull_s8(vb13x0, va1x0);
+        vprod0x13 = vmlal_s8(vprod0x13, vb13x1, va0x1);
+        vprod1x13 = vmlal_s8(vprod1x13, vb13x1, va1x1);
+        vacc0x13 = vpadalq_s16(vacc0x13, vprod0x13);
+        vacc1x13 = vpadalq_s16(vacc1x13, vprod1x13);
+        const int8x8_t vb14x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x14 = vmull_s8(vb14x0, va0x0);
+        int16x8_t vprod1x14 = vmull_s8(vb14x0, va1x0);
+        vprod0x14 = vmlal_s8(vprod0x14, vb14x1, va0x1);
+        vprod1x14 = vmlal_s8(vprod1x14, vb14x1, va1x1);
+        vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
+        vacc1x14 = vpadalq_s16(vacc1x14, vprod1x14);
+        const int8x8_t vb15x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x15 = vmull_s8(vb15x0, va0x0);
+        int16x8_t vprod1x15 = vmull_s8(vb15x0, va1x0);
+        vprod0x15 = vmlal_s8(vprod0x15, vb15x1, va0x1);
+        vprod1x15 = vmlal_s8(vprod1x15, vb15x1, va1x1);
+        vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
+        vacc1x15 = vpadalq_s16(vacc1x15, vprod1x15);
+
+        k -= 16 * sizeof(int8_t);
+      }
+
+      // Handle 8 bytes at a time using MUL.
+      if (k > 0) {
+        const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+
+        const int8x8_t vb0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x0 = vmull_s8(vb0, va0);
+        const int16x8_t vprod1x0 = vmull_s8(vb0, va1);
+        vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+        vacc1x0 = vpadalq_s16(vacc1x0, vprod1x0);
+        const int8x8_t vb1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x1 = vmull_s8(vb1, va0);
+        const int16x8_t vprod1x1 = vmull_s8(vb1, va1);
+        vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+        vacc1x1 = vpadalq_s16(vacc1x1, vprod1x1);
+        const int8x8_t vb2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x2 = vmull_s8(vb2, va0);
+        const int16x8_t vprod1x2 = vmull_s8(vb2, va1);
+        vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+        vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);
+        const int8x8_t vb3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x3 = vmull_s8(vb3, va0);
+        const int16x8_t vprod1x3 = vmull_s8(vb3, va1);
+        vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+        vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);
+        const int8x8_t vb4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x4 = vmull_s8(vb4, va0);
+        const int16x8_t vprod1x4 = vmull_s8(vb4, va1);
+        vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+        vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);
+        const int8x8_t vb5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x5 = vmull_s8(vb5, va0);
+        const int16x8_t vprod1x5 = vmull_s8(vb5, va1);
+        vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+        vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);
+        const int8x8_t vb6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x6 = vmull_s8(vb6, va0);
+        const int16x8_t vprod1x6 = vmull_s8(vb6, va1);
+        vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+        vacc1x6 = vpadalq_s16(vacc1x6, vprod1x6);
+        const int8x8_t vb7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x7 = vmull_s8(vb7, va0);
+        const int16x8_t vprod1x7 = vmull_s8(vb7, va1);
+        vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+        vacc1x7 = vpadalq_s16(vacc1x7, vprod1x7);
+        const int8x8_t vb8 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x8 = vmull_s8(vb8, va0);
+        const int16x8_t vprod1x8 = vmull_s8(vb8, va1);
+        vacc0x8 = vpadalq_s16(vacc0x8, vprod0x8);
+        vacc1x8 = vpadalq_s16(vacc1x8, vprod1x8);
+        const int8x8_t vb9 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x9 = vmull_s8(vb9, va0);
+        const int16x8_t vprod1x9 = vmull_s8(vb9, va1);
+        vacc0x9 = vpadalq_s16(vacc0x9, vprod0x9);
+        vacc1x9 = vpadalq_s16(vacc1x9, vprod1x9);
+        const int8x8_t vb10 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x10 = vmull_s8(vb10, va0);
+        const int16x8_t vprod1x10 = vmull_s8(vb10, va1);
+        vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
+        vacc1x10 = vpadalq_s16(vacc1x10, vprod1x10);
+        const int8x8_t vb11 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x11 = vmull_s8(vb11, va0);
+        const int16x8_t vprod1x11 = vmull_s8(vb11, va1);
+        vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11);
+        vacc1x11 = vpadalq_s16(vacc1x11, vprod1x11);
+        const int8x8_t vb12 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x12 = vmull_s8(vb12, va0);
+        const int16x8_t vprod1x12 = vmull_s8(vb12, va1);
+        vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);
+        vacc1x12 = vpadalq_s16(vacc1x12, vprod1x12);
+        const int8x8_t vb13 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x13 = vmull_s8(vb13, va0);
+        const int16x8_t vprod1x13 = vmull_s8(vb13, va1);
+        vacc0x13 = vpadalq_s16(vacc0x13, vprod0x13);
+        vacc1x13 = vpadalq_s16(vacc1x13, vprod1x13);
+        const int8x8_t vb14 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x14 = vmull_s8(vb14, va0);
+        const int16x8_t vprod1x14 = vmull_s8(vb14, va1);
+        vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
+        vacc1x14 = vpadalq_s16(vacc1x14, vprod1x14);
+        const int8x8_t vb15 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x15 = vmull_s8(vb15, va0);
+        const int16x8_t vprod1x15 = vmull_s8(vb15, va1);
+        vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
+        vacc1x15 = vpadalq_s16(vacc1x15, vprod1x15);
+
+        k -= 8 * sizeof(int8_t);
+      }
+
+      p -= 2 * sizeof(void*);
+    } while (p != 0);
+
+#if XNN_ARCH_ARM64
+    const int32x4_t vsum0x01 = vpaddq_s32(vacc0x0, vacc0x1);
+    const int32x4_t vsum0x23 = vpaddq_s32(vacc0x2, vacc0x3);
+    const int32x4_t vsum0x45 = vpaddq_s32(vacc0x4, vacc0x5);
+    const int32x4_t vsum0x67 = vpaddq_s32(vacc0x6, vacc0x7);
+    const int32x4_t vsum0x89 = vpaddq_s32(vacc0x8, vacc0x9);
+    const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11);
+    const int32x4_t vsum0xCD = vpaddq_s32(vacc0x12, vacc0x13);
+    const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
+    const int32x4_t vsum1x01 = vpaddq_s32(vacc1x0, vacc1x1);
+    const int32x4_t vsum1x23 = vpaddq_s32(vacc1x2, vacc1x3);
+    const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);
+    const int32x4_t vsum1x67 = vpaddq_s32(vacc1x6, vacc1x7);
+    const int32x4_t vsum1x89 = vpaddq_s32(vacc1x8, vacc1x9);
+    const int32x4_t vsum1xAB = vpaddq_s32(vacc1x10, vacc1x11);
+    const int32x4_t vsum1xCD = vpaddq_s32(vacc1x12, vacc1x13);
+    const int32x4_t vsum1xEF = vpaddq_s32(vacc1x14, vacc1x15);
+    int32x4_t vacc0x0123 = vpaddq_s32(vsum0x01, vsum0x23);
+    int32x4_t vacc0x4567 = vpaddq_s32(vsum0x45, vsum0x67);
+    int32x4_t vacc0x89AB = vpaddq_s32(vsum0x89, vsum0xAB);
+    int32x4_t vacc0xCDEF = vpaddq_s32(vsum0xCD, vsum0xEF);
+    int32x4_t vacc1x0123 = vpaddq_s32(vsum1x01, vsum1x23);
+    int32x4_t vacc1x4567 = vpaddq_s32(vsum1x45, vsum1x67);
+    int32x4_t vacc1x89AB = vpaddq_s32(vsum1x89, vsum1xAB);
+    int32x4_t vacc1xCDEF = vpaddq_s32(vsum1xCD, vsum1xEF);
+#else
+    const int32x2_t vpsum0x0 = vadd_s32(vget_low_s32(vacc0x0), vget_high_s32(vacc0x0));
+    const int32x2_t vpsum0x1 = vadd_s32(vget_low_s32(vacc0x1), vget_high_s32(vacc0x1));
+    const int32x2_t vpsum0x2 = vadd_s32(vget_low_s32(vacc0x2), vget_high_s32(vacc0x2));
+    const int32x2_t vpsum0x3 = vadd_s32(vget_low_s32(vacc0x3), vget_high_s32(vacc0x3));
+    const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1);
+    const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3);
+    int32x4_t vacc0x0123 = vcombine_s32(vsum0x01, vsum0x23 );
+    const int32x2_t vpsum0x4 = vadd_s32(vget_low_s32(vacc0x4), vget_high_s32(vacc0x4));
+    const int32x2_t vpsum0x5 = vadd_s32(vget_low_s32(vacc0x5), vget_high_s32(vacc0x5));
+    const int32x2_t vpsum0x6 = vadd_s32(vget_low_s32(vacc0x6), vget_high_s32(vacc0x6));
+    const int32x2_t vpsum0x7 = vadd_s32(vget_low_s32(vacc0x7), vget_high_s32(vacc0x7));
+    const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5);
+    const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7);
+    int32x4_t vacc0x4567 = vcombine_s32(vsum0x45, vsum0x67 );
+    const int32x2_t vpsum0x8 = vadd_s32(vget_low_s32(vacc0x8), vget_high_s32(vacc0x8));
+    const int32x2_t vpsum0x9 = vadd_s32(vget_low_s32(vacc0x9), vget_high_s32(vacc0x9));
+    const int32x2_t vpsum0xA = vadd_s32(vget_low_s32(vacc0x10), vget_high_s32(vacc0x10));
+    const int32x2_t vpsum0xB = vadd_s32(vget_low_s32(vacc0x11), vget_high_s32(vacc0x11));
+    const int32x2_t vsum0x89 = vpadd_s32(vpsum0x8, vpsum0x9);
+    const int32x2_t vsum0xAB = vpadd_s32(vpsum0xA, vpsum0xB);
+    int32x4_t vacc0x89AB = vcombine_s32(vsum0x89, vsum0xAB );
+    const int32x2_t vpsum0xC = vadd_s32(vget_low_s32(vacc0x12), vget_high_s32(vacc0x12));
+    const int32x2_t vpsum0xD = vadd_s32(vget_low_s32(vacc0x13), vget_high_s32(vacc0x13));
+    const int32x2_t vpsum0xE = vadd_s32(vget_low_s32(vacc0x14), vget_high_s32(vacc0x14));
+    const int32x2_t vpsum0xF = vadd_s32(vget_low_s32(vacc0x15), vget_high_s32(vacc0x15));
+    const int32x2_t vsum0xCD = vpadd_s32(vpsum0xC, vpsum0xD);
+    const int32x2_t vsum0xEF = vpadd_s32(vpsum0xE, vpsum0xF);
+    int32x4_t vacc0xCDEF = vcombine_s32(vsum0xCD, vsum0xEF );
+    const int32x2_t vpsum1x0 = vadd_s32(vget_low_s32(vacc1x0), vget_high_s32(vacc1x0));
+    const int32x2_t vpsum1x1 = vadd_s32(vget_low_s32(vacc1x1), vget_high_s32(vacc1x1));
+    const int32x2_t vpsum1x2 = vadd_s32(vget_low_s32(vacc1x2), vget_high_s32(vacc1x2));
+    const int32x2_t vpsum1x3 = vadd_s32(vget_low_s32(vacc1x3), vget_high_s32(vacc1x3));
+    const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1);
+    const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3);
+    int32x4_t vacc1x0123 = vcombine_s32(vsum1x01, vsum1x23 );
+    const int32x2_t vpsum1x4 = vadd_s32(vget_low_s32(vacc1x4), vget_high_s32(vacc1x4));
+    const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));
+    const int32x2_t vpsum1x6 = vadd_s32(vget_low_s32(vacc1x6), vget_high_s32(vacc1x6));
+    const int32x2_t vpsum1x7 = vadd_s32(vget_low_s32(vacc1x7), vget_high_s32(vacc1x7));
+    const int32x2_t vsum1x45 = vpadd_s32(vpsum1x4, vpsum1x5);
+    const int32x2_t vsum1x67 = vpadd_s32(vpsum1x6, vpsum1x7);
+    int32x4_t vacc1x4567 = vcombine_s32(vsum1x45, vsum1x67 );
+    const int32x2_t vpsum1x8 = vadd_s32(vget_low_s32(vacc1x8), vget_high_s32(vacc1x8));
+    const int32x2_t vpsum1x9 = vadd_s32(vget_low_s32(vacc1x9), vget_high_s32(vacc1x9));
+    const int32x2_t vpsum1xA = vadd_s32(vget_low_s32(vacc1x10), vget_high_s32(vacc1x10));
+    const int32x2_t vpsum1xB = vadd_s32(vget_low_s32(vacc1x11), vget_high_s32(vacc1x11));
+    const int32x2_t vsum1x89 = vpadd_s32(vpsum1x8, vpsum1x9);
+    const int32x2_t vsum1xAB = vpadd_s32(vpsum1xA, vpsum1xB);
+    int32x4_t vacc1x89AB = vcombine_s32(vsum1x89, vsum1xAB );
+    const int32x2_t vpsum1xC = vadd_s32(vget_low_s32(vacc1x12), vget_high_s32(vacc1x12));
+    const int32x2_t vpsum1xD = vadd_s32(vget_low_s32(vacc1x13), vget_high_s32(vacc1x13));
+    const int32x2_t vpsum1xE = vadd_s32(vget_low_s32(vacc1x14), vget_high_s32(vacc1x14));
+    const int32x2_t vpsum1xF = vadd_s32(vget_low_s32(vacc1x15), vget_high_s32(vacc1x15));
+    const int32x2_t vsum1xCD = vpadd_s32(vpsum1xC, vpsum1xD);
+    const int32x2_t vsum1xEF = vpadd_s32(vpsum1xE, vpsum1xF);
+    int32x4_t vacc1xCDEF = vcombine_s32(vsum1xCD, vsum1xEF );
+#endif
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+    vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc1x89AB = vqrdmulhq_s32(vacc1x89AB, vmultiplier);
+    vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+    vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc1x89AB = vsraq_n_s32(vacc1x89AB, vbicq_s32(vacc1x89AB, vzero_shift_mask), 31);
+    vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc1x89AB = vrshlq_s32(vacc1x89AB, vright_shift);
+    vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
+    int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+    int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+    int8x16_t vout1x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc1x01234567), vqmovn_s16(vacc1x89ABCDEF));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout1x0123456789ABCDEF = vmaxq_s8(vout1x0123456789ABCDEF, voutput_min);
+    vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+
+    vout1x0123456789ABCDEF = vminq_s8(vout1x0123456789ABCDEF, voutput_max);
+    vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+
+    if (nc >= 16) {
+      vst1q_s8(c1 + 0, vout1x0123456789ABCDEF);
+      vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 16;
+    } else {
+      int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8(vout1x0123456789ABCDEF));
+      if (nc & 8) {
+        vst1_s8(c1, vget_high_s8(vout0x01234567_1x01234567)); c1 += 8;
+        vst1_s8(c0, vget_low_s8(vout0x01234567_1x01234567)); c0 += 8;
+        vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0123456789ABCDEF));
+      }
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
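The kernel above is the MLAL variant: its main loop consumes 16 bytes of K per iteration by loading two 8-byte halves of each A row, widening-multiplying the first half with `vmull_s8`, folding the second half into the same 16-bit product with `vmlal_s8`, and only then accumulating into 32 bits with `vpadalq_s16`; the remainder path falls back to a single `vmull_s8` per 8 bytes. Ignoring the vectorization, the loop adds to each `vacc{m}x{n}` register (two rows by sixteen output channels, each pre-loaded with its bias) the dot product sketched below, where `b` stands for the kc weights of output channel n gathered from the packed stream; the names here are illustrative, not library API.

#include <stddef.h>
#include <stdint.h>

// Scalar sketch of what the inner loop adds to one c8 accumulator
// (each vacc starts from the per-channel bias loaded from w;
// requantization happens afterwards).
static int32_t dot_c8_sketch(const int8_t* a, const int8_t* b, size_t kc /* multiple of 8 */) {
  int32_t acc = 0;
  for (size_t k = 0; k < kc; k += 8) {
    for (size_t i = 0; i < 8; i++) {
      // vmull_s8/vmlal_s8 form the 16-bit products; vpadalq_s16 folds pairs into 32 bits.
      acc += (int32_t) a[k + i] * (int32_t) b[k + i];
    }
  }
  return acc;
}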
diff --git a/src/qs8-igemm/gen/2x16c8-minmax-neon-mull-padal.c b/src/qs8-igemm/gen/2x16c8-minmax-neon-mull-padal.c
new file mode 100644
index 0000000..f784e33
--- /dev/null
+++ b/src/qs8-igemm/gen/2x16c8-minmax-neon-mull-padal.c
@@ -0,0 +1,365 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/c8-neon-mull-padal.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (2 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    c1 = c0;
+  }
+
+  do {
+    int32x4_t vacc0x0 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x1 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x2 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x3 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x4 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x5 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x6 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x7 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x8 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x9 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x10 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x11 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x12 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x13 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x14 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x15 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc1x0 = vacc0x0;
+    int32x4_t vacc1x1 = vacc0x1;
+    int32x4_t vacc1x2 = vacc0x2;
+    int32x4_t vacc1x3 = vacc0x3;
+    int32x4_t vacc1x4 = vacc0x4;
+    int32x4_t vacc1x5 = vacc0x5;
+    int32x4_t vacc1x6 = vacc0x6;
+    int32x4_t vacc1x7 = vacc0x7;
+    int32x4_t vacc1x8 = vacc0x8;
+    int32x4_t vacc1x9 = vacc0x9;
+    int32x4_t vacc1x10 = vacc0x10;
+    int32x4_t vacc1x11 = vacc0x11;
+    int32x4_t vacc1x12 = vacc0x12;
+    int32x4_t vacc1x13 = vacc0x13;
+    int32x4_t vacc1x14 = vacc0x14;
+    int32x4_t vacc1x15 = vacc0x15;
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      a += 2;
+
+      size_t k = kc;
+
+      // Handle 8 bytes at a time using MUL.
+      while (k > 0) {
+        const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+
+        const int8x8_t vb0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x0 = vmull_s8(vb0, va0);
+        const int16x8_t vprod1x0 = vmull_s8(vb0, va1);
+        vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+        vacc1x0 = vpadalq_s16(vacc1x0, vprod1x0);
+        const int8x8_t vb1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x1 = vmull_s8(vb1, va0);
+        const int16x8_t vprod1x1 = vmull_s8(vb1, va1);
+        vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+        vacc1x1 = vpadalq_s16(vacc1x1, vprod1x1);
+        const int8x8_t vb2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x2 = vmull_s8(vb2, va0);
+        const int16x8_t vprod1x2 = vmull_s8(vb2, va1);
+        vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+        vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);
+        const int8x8_t vb3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x3 = vmull_s8(vb3, va0);
+        const int16x8_t vprod1x3 = vmull_s8(vb3, va1);
+        vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+        vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);
+        const int8x8_t vb4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x4 = vmull_s8(vb4, va0);
+        const int16x8_t vprod1x4 = vmull_s8(vb4, va1);
+        vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+        vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);
+        const int8x8_t vb5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x5 = vmull_s8(vb5, va0);
+        const int16x8_t vprod1x5 = vmull_s8(vb5, va1);
+        vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+        vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);
+        const int8x8_t vb6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x6 = vmull_s8(vb6, va0);
+        const int16x8_t vprod1x6 = vmull_s8(vb6, va1);
+        vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+        vacc1x6 = vpadalq_s16(vacc1x6, vprod1x6);
+        const int8x8_t vb7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x7 = vmull_s8(vb7, va0);
+        const int16x8_t vprod1x7 = vmull_s8(vb7, va1);
+        vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+        vacc1x7 = vpadalq_s16(vacc1x7, vprod1x7);
+        const int8x8_t vb8 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x8 = vmull_s8(vb8, va0);
+        const int16x8_t vprod1x8 = vmull_s8(vb8, va1);
+        vacc0x8 = vpadalq_s16(vacc0x8, vprod0x8);
+        vacc1x8 = vpadalq_s16(vacc1x8, vprod1x8);
+        const int8x8_t vb9 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x9 = vmull_s8(vb9, va0);
+        const int16x8_t vprod1x9 = vmull_s8(vb9, va1);
+        vacc0x9 = vpadalq_s16(vacc0x9, vprod0x9);
+        vacc1x9 = vpadalq_s16(vacc1x9, vprod1x9);
+        const int8x8_t vb10 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x10 = vmull_s8(vb10, va0);
+        const int16x8_t vprod1x10 = vmull_s8(vb10, va1);
+        vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
+        vacc1x10 = vpadalq_s16(vacc1x10, vprod1x10);
+        const int8x8_t vb11 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x11 = vmull_s8(vb11, va0);
+        const int16x8_t vprod1x11 = vmull_s8(vb11, va1);
+        vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11);
+        vacc1x11 = vpadalq_s16(vacc1x11, vprod1x11);
+        const int8x8_t vb12 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x12 = vmull_s8(vb12, va0);
+        const int16x8_t vprod1x12 = vmull_s8(vb12, va1);
+        vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);
+        vacc1x12 = vpadalq_s16(vacc1x12, vprod1x12);
+        const int8x8_t vb13 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x13 = vmull_s8(vb13, va0);
+        const int16x8_t vprod1x13 = vmull_s8(vb13, va1);
+        vacc0x13 = vpadalq_s16(vacc0x13, vprod0x13);
+        vacc1x13 = vpadalq_s16(vacc1x13, vprod1x13);
+        const int8x8_t vb14 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x14 = vmull_s8(vb14, va0);
+        const int16x8_t vprod1x14 = vmull_s8(vb14, va1);
+        vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
+        vacc1x14 = vpadalq_s16(vacc1x14, vprod1x14);
+        const int8x8_t vb15 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x15 = vmull_s8(vb15, va0);
+        const int16x8_t vprod1x15 = vmull_s8(vb15, va1);
+        vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
+        vacc1x15 = vpadalq_s16(vacc1x15, vprod1x15);
+
+        k -= 8 * sizeof(int8_t);
+      }
+
+      p -= 2 * sizeof(void*);
+    } while (p != 0);
+
+#if XNN_ARCH_ARM64
+    const int32x4_t vsum0x01 = vpaddq_s32(vacc0x0, vacc0x1);
+    const int32x4_t vsum0x23 = vpaddq_s32(vacc0x2, vacc0x3);
+    const int32x4_t vsum0x45 = vpaddq_s32(vacc0x4, vacc0x5);
+    const int32x4_t vsum0x67 = vpaddq_s32(vacc0x6, vacc0x7);
+    const int32x4_t vsum0x89 = vpaddq_s32(vacc0x8, vacc0x9);
+    const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11);
+    const int32x4_t vsum0xCD = vpaddq_s32(vacc0x12, vacc0x13);
+    const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
+    const int32x4_t vsum1x01 = vpaddq_s32(vacc1x0, vacc1x1);
+    const int32x4_t vsum1x23 = vpaddq_s32(vacc1x2, vacc1x3);
+    const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);
+    const int32x4_t vsum1x67 = vpaddq_s32(vacc1x6, vacc1x7);
+    const int32x4_t vsum1x89 = vpaddq_s32(vacc1x8, vacc1x9);
+    const int32x4_t vsum1xAB = vpaddq_s32(vacc1x10, vacc1x11);
+    const int32x4_t vsum1xCD = vpaddq_s32(vacc1x12, vacc1x13);
+    const int32x4_t vsum1xEF = vpaddq_s32(vacc1x14, vacc1x15);
+    int32x4_t vacc0x0123 = vpaddq_s32(vsum0x01, vsum0x23);
+    int32x4_t vacc0x4567 = vpaddq_s32(vsum0x45, vsum0x67);
+    int32x4_t vacc0x89AB = vpaddq_s32(vsum0x89, vsum0xAB);
+    int32x4_t vacc0xCDEF = vpaddq_s32(vsum0xCD, vsum0xEF);
+    int32x4_t vacc1x0123 = vpaddq_s32(vsum1x01, vsum1x23);
+    int32x4_t vacc1x4567 = vpaddq_s32(vsum1x45, vsum1x67);
+    int32x4_t vacc1x89AB = vpaddq_s32(vsum1x89, vsum1xAB);
+    int32x4_t vacc1xCDEF = vpaddq_s32(vsum1xCD, vsum1xEF);
+#else
+    const int32x2_t vpsum0x0 = vadd_s32(vget_low_s32(vacc0x0), vget_high_s32(vacc0x0));
+    const int32x2_t vpsum0x1 = vadd_s32(vget_low_s32(vacc0x1), vget_high_s32(vacc0x1));
+    const int32x2_t vpsum0x2 = vadd_s32(vget_low_s32(vacc0x2), vget_high_s32(vacc0x2));
+    const int32x2_t vpsum0x3 = vadd_s32(vget_low_s32(vacc0x3), vget_high_s32(vacc0x3));
+    const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1);
+    const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3);
+    int32x4_t vacc0x0123 = vcombine_s32(vsum0x01, vsum0x23 );
+    const int32x2_t vpsum0x4 = vadd_s32(vget_low_s32(vacc0x4), vget_high_s32(vacc0x4));
+    const int32x2_t vpsum0x5 = vadd_s32(vget_low_s32(vacc0x5), vget_high_s32(vacc0x5));
+    const int32x2_t vpsum0x6 = vadd_s32(vget_low_s32(vacc0x6), vget_high_s32(vacc0x6));
+    const int32x2_t vpsum0x7 = vadd_s32(vget_low_s32(vacc0x7), vget_high_s32(vacc0x7));
+    const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5);
+    const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7);
+    int32x4_t vacc0x4567 = vcombine_s32(vsum0x45, vsum0x67 );
+    const int32x2_t vpsum0x8 = vadd_s32(vget_low_s32(vacc0x8), vget_high_s32(vacc0x8));
+    const int32x2_t vpsum0x9 = vadd_s32(vget_low_s32(vacc0x9), vget_high_s32(vacc0x9));
+    const int32x2_t vpsum0xA = vadd_s32(vget_low_s32(vacc0x10), vget_high_s32(vacc0x10));
+    const int32x2_t vpsum0xB = vadd_s32(vget_low_s32(vacc0x11), vget_high_s32(vacc0x11));
+    const int32x2_t vsum0x89 = vpadd_s32(vpsum0x8, vpsum0x9);
+    const int32x2_t vsum0xAB = vpadd_s32(vpsum0xA, vpsum0xB);
+    int32x4_t vacc0x89AB = vcombine_s32(vsum0x89, vsum0xAB );
+    const int32x2_t vpsum0xC = vadd_s32(vget_low_s32(vacc0x12), vget_high_s32(vacc0x12));
+    const int32x2_t vpsum0xD = vadd_s32(vget_low_s32(vacc0x13), vget_high_s32(vacc0x13));
+    const int32x2_t vpsum0xE = vadd_s32(vget_low_s32(vacc0x14), vget_high_s32(vacc0x14));
+    const int32x2_t vpsum0xF = vadd_s32(vget_low_s32(vacc0x15), vget_high_s32(vacc0x15));
+    const int32x2_t vsum0xCD = vpadd_s32(vpsum0xC, vpsum0xD);
+    const int32x2_t vsum0xEF = vpadd_s32(vpsum0xE, vpsum0xF);
+    int32x4_t vacc0xCDEF = vcombine_s32(vsum0xCD, vsum0xEF );
+    const int32x2_t vpsum1x0 = vadd_s32(vget_low_s32(vacc1x0), vget_high_s32(vacc1x0));
+    const int32x2_t vpsum1x1 = vadd_s32(vget_low_s32(vacc1x1), vget_high_s32(vacc1x1));
+    const int32x2_t vpsum1x2 = vadd_s32(vget_low_s32(vacc1x2), vget_high_s32(vacc1x2));
+    const int32x2_t vpsum1x3 = vadd_s32(vget_low_s32(vacc1x3), vget_high_s32(vacc1x3));
+    const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1);
+    const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3);
+    int32x4_t vacc1x0123 = vcombine_s32(vsum1x01, vsum1x23 );
+    const int32x2_t vpsum1x4 = vadd_s32(vget_low_s32(vacc1x4), vget_high_s32(vacc1x4));
+    const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));
+    const int32x2_t vpsum1x6 = vadd_s32(vget_low_s32(vacc1x6), vget_high_s32(vacc1x6));
+    const int32x2_t vpsum1x7 = vadd_s32(vget_low_s32(vacc1x7), vget_high_s32(vacc1x7));
+    const int32x2_t vsum1x45 = vpadd_s32(vpsum1x4, vpsum1x5);
+    const int32x2_t vsum1x67 = vpadd_s32(vpsum1x6, vpsum1x7);
+    int32x4_t vacc1x4567 = vcombine_s32(vsum1x45, vsum1x67 );
+    const int32x2_t vpsum1x8 = vadd_s32(vget_low_s32(vacc1x8), vget_high_s32(vacc1x8));
+    const int32x2_t vpsum1x9 = vadd_s32(vget_low_s32(vacc1x9), vget_high_s32(vacc1x9));
+    const int32x2_t vpsum1xA = vadd_s32(vget_low_s32(vacc1x10), vget_high_s32(vacc1x10));
+    const int32x2_t vpsum1xB = vadd_s32(vget_low_s32(vacc1x11), vget_high_s32(vacc1x11));
+    const int32x2_t vsum1x89 = vpadd_s32(vpsum1x8, vpsum1x9);
+    const int32x2_t vsum1xAB = vpadd_s32(vpsum1xA, vpsum1xB);
+    int32x4_t vacc1x89AB = vcombine_s32(vsum1x89, vsum1xAB );
+    const int32x2_t vpsum1xC = vadd_s32(vget_low_s32(vacc1x12), vget_high_s32(vacc1x12));
+    const int32x2_t vpsum1xD = vadd_s32(vget_low_s32(vacc1x13), vget_high_s32(vacc1x13));
+    const int32x2_t vpsum1xE = vadd_s32(vget_low_s32(vacc1x14), vget_high_s32(vacc1x14));
+    const int32x2_t vpsum1xF = vadd_s32(vget_low_s32(vacc1x15), vget_high_s32(vacc1x15));
+    const int32x2_t vsum1xCD = vpadd_s32(vpsum1xC, vpsum1xD);
+    const int32x2_t vsum1xEF = vpadd_s32(vpsum1xE, vpsum1xF);
+    int32x4_t vacc1xCDEF = vcombine_s32(vsum1xCD, vsum1xEF );
+#endif
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+    vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc1x89AB = vqrdmulhq_s32(vacc1x89AB, vmultiplier);
+    vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+    vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc1x89AB = vsraq_n_s32(vacc1x89AB, vbicq_s32(vacc1x89AB, vzero_shift_mask), 31);
+    vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc1x89AB = vrshlq_s32(vacc1x89AB, vright_shift);
+    vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
+    int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+    int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+    int8x16_t vout1x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc1x01234567), vqmovn_s16(vacc1x89ABCDEF));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout1x0123456789ABCDEF = vmaxq_s8(vout1x0123456789ABCDEF, voutput_min);
+    vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+
+    vout1x0123456789ABCDEF = vminq_s8(vout1x0123456789ABCDEF, voutput_max);
+    vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+
+    if (nc >= 16) {
+      vst1q_s8(c1 + 0, vout1x0123456789ABCDEF);
+      vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 16;
+    } else {
+      int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8(vout1x0123456789ABCDEF));
+      if (nc & 8) {
+        vst1_s8(c1, vget_high_s8(vout0x01234567_1x01234567)); c1 += 8;
+        vst1_s8(c0, vget_low_s8(vout0x01234567_1x01234567)); c0 += 8;
+        vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0123456789ABCDEF));
+      }
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
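The `2x16c8` mull-padal kernel above is the same computation as the preceding mlal-padal variant minus the 16-byte unrolled loop: its only inner loop is the 8-bytes-per-iteration `vmull_s8` path, which keeps fewer values live at once at the cost of a separate widening multiply for every 8-byte group. The pairwise reduction (`vpaddq_s32`/`vpadd_s32`), requantization, and tail-store code is identical between the two variants; which one is actually used is decided by the per-architecture kernel selection elsewhere in the library, not by this diff.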
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-sse2-ld128.c b/src/qs8-igemm/gen/2x4c8-minmax-sse2-ld128.c
index 906861e..c771d14 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-sse2-ld128.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-sse2-ld128.c
@@ -12,6 +12,7 @@
 #include <emmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_2x4c8__sse2_ld128(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr != 2) {
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-sse2-ld64.c b/src/qs8-igemm/gen/2x4c8-minmax-sse2-ld64.c
index 3a728bd..e49603b 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-sse2-ld64.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-sse2-ld64.c
@@ -12,6 +12,7 @@
 #include <emmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_2x4c8__sse2_ld64(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr != 2) {
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-sse41-ld128.c b/src/qs8-igemm/gen/2x4c8-minmax-sse41-ld128.c
index 6d68d36..e5e18fd 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-sse41-ld128.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-sse41-ld128.c
@@ -12,6 +12,7 @@
 #include <smmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr != 2) {
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-sse41-ld64.c b/src/qs8-igemm/gen/2x4c8-minmax-sse41-ld64.c
index c7b7392..09d8a27 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-sse41-ld64.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-sse41-ld64.c
@@ -12,6 +12,7 @@
 #include <smmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr != 2) {
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-ssse3-ld128.c b/src/qs8-igemm/gen/2x4c8-minmax-ssse3-ld128.c
index 2b1d86a..d10dc86 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-ssse3-ld128.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-ssse3-ld128.c
@@ -12,6 +12,7 @@
 #include <tmmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_2x4c8__ssse3_ld128(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr != 2) {
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-ssse3-ld64.c b/src/qs8-igemm/gen/2x4c8-minmax-ssse3-ld64.c
index 15e19f9..2b97a3c 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-ssse3-ld64.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-ssse3-ld64.c
@@ -12,6 +12,7 @@
 #include <tmmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_2x4c8__ssse3_ld64(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr != 2) {
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-wasmsimd-ld128.c b/src/qs8-igemm/gen/2x4c8-minmax-wasmsimd-ld128.c
index 11188d4..3aa0434 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-wasmsimd-ld128.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-wasmsimd-ld128.c
@@ -12,6 +12,7 @@
 #include <wasm_simd128.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_2x4c8__wasmsimd_ld128(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr != 2) {
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-wasmsimd-ld64.c b/src/qs8-igemm/gen/2x4c8-minmax-wasmsimd-ld64.c
index a85d4d7..d2675e6 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-wasmsimd-ld64.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-wasmsimd-ld64.c
@@ -12,6 +12,7 @@
 #include <wasm_simd128.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_2x4c8__wasmsimd_ld64(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr != 2) {
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-xop-ld128.c b/src/qs8-igemm/gen/2x4c8-minmax-xop-ld128.c
index 00d6188..e58f425 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-xop-ld128.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-xop-ld128.c
@@ -17,6 +17,7 @@
 #endif
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128(
@@ -44,6 +45,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr != 2) {
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-xop-ld64.c b/src/qs8-igemm/gen/2x4c8-minmax-xop-ld64.c
index 70458bd..02bbdf4 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-xop-ld64.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-xop-ld64.c
@@ -17,6 +17,7 @@
 #endif
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64(
@@ -44,6 +45,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr != 2) {
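The hunks above add the same one-line prologue, kc = round_up_po2(kc, 8), to every 2x4c8 QS8 IGEMM variant (SSE2/SSSE3/SSE4.1/XOP/WAsm SIMD, ld64 and ld128), so the K extent the kernel iterates over matches the 8-element granularity in which the packed weights are laid out. The real helper comes from the newly included <xnnpack/math.h>; the standalone sketch below only illustrates what this kind of power-of-two round-up computes and is not the library definition.

    #include <assert.h>
    #include <stddef.h>

    // Illustrative stand-in for round_up_po2(): round n up to the nearest
    // multiple of q, where q must be a power of two.
    static inline size_t example_round_up_po2(size_t n, size_t q) {
      assert(q != 0 && (q & (q - 1)) == 0);  // q must be a power of two
      return (n + q - 1) & ~(q - 1);
    }

    // example_round_up_po2(3, 8) == 8; example_round_up_po2(17, 8) == 24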
diff --git a/src/qs8-igemm/gen/2x8-minmax-neon-mull-addw-dup.c b/src/qs8-igemm/gen/2x8-minmax-neon-mull-addw-dup.c
new file mode 100644
index 0000000..c1e4562
--- /dev/null
+++ b/src/qs8-igemm/gen/2x8-minmax-neon-mull-addw-dup.c
@@ -0,0 +1,287 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/neon-mull-addw-dup.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/gemm.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_2x8__neon_mull_addw_dup(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (2 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    c1 = c0;
+  }
+
+  do {
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc1x0123 = vacc0x0123;
+    int32x4_t vacc1x4567 = vacc0x4567;
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      a += 2;
+
+      size_t k = kc;
+      while (k >= 8 * sizeof(int8_t)) {
+        const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+
+        const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va0, 0));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c0));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c0));
+        const int16x8_t vprod1x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va1, 0));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c0));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c0));
+        const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va0, 1));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c1));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c1));
+        const int16x8_t vprod1x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va1, 1));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c1));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c1));
+        const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va0, 2));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c2));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c2));
+        const int16x8_t vprod1x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va1, 2));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c2));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c2));
+        const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));
+        const int16x8_t vprod1x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va1, 3));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c3));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c3));
+        const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va0, 4));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c4));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c4));
+        const int16x8_t vprod1x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va1, 4));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c4));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c4));
+        const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va0, 5));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c5));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c5));
+        const int16x8_t vprod1x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va1, 5));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c5));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c5));
+        const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va0, 6));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c6));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c6));
+        const int16x8_t vprod1x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va1, 6));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c6));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c6));
+        const int8x8_t vb01234567c7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c7 = vmull_s8(vb01234567c7, vdup_lane_s8(va0, 7));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c7));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c7));
+        const int16x8_t vprod1x01234567c7 = vmull_s8(vb01234567c7, vdup_lane_s8(va1, 7));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c7));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c7));
+
+        k -= 8 * sizeof(int8_t);
+      }
+      if XNN_UNLIKELY(k != 0) {
+        const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+        const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
+
+        const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va0, 0));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c0));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c0));
+        const int16x8_t vprod1x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va1, 0));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c0));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c0));
+
+        if (k >= 2 * sizeof(int8_t)) {
+          const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          const int16x8_t vprod0x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va0, 1));
+          vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c1));
+          vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c1));
+          const int16x8_t vprod1x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va1, 1));
+          vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c1));
+          vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c1));
+
+          if (k > 2 * sizeof(int8_t)) {
+            const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+            const int16x8_t vprod0x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va0, 2));
+            vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c2));
+            vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c2));
+            const int16x8_t vprod1x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va1, 2));
+            vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c2));
+            vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c2));
+
+            if (k >= 4 * sizeof(int8_t)) {
+              const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+              const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));
+              vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
+              vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));
+              const int16x8_t vprod1x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va1, 3));
+              vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c3));
+              vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c3));
+
+              if (k > 4 * sizeof(int8_t)) {
+                const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+                const int16x8_t vprod0x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va0, 4));
+                vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c4));
+                vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c4));
+                const int16x8_t vprod1x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va1, 4));
+                vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c4));
+                vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c4));
+
+                if (k >= 6 * sizeof(int8_t)) {
+                  const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+                  const int16x8_t vprod0x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va0, 5));
+                  vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c5));
+                  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c5));
+                  const int16x8_t vprod1x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va1, 5));
+                  vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c5));
+                  vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c5));
+
+                  if (k > 6 * sizeof(int8_t)) {
+                    const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+                    const int16x8_t vprod0x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va0, 6));
+                    vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c6));
+                    vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c6));
+                    const int16x8_t vprod1x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va1, 6));
+                    vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c6));
+                    vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c6));
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+      p -= 2 * sizeof(void*);
+    } while (p != 0);
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min);
+
+    vout0x01234567_1x01234567 = vminq_s8(vout0x01234567_1x01234567, voutput_max);
+
+    if (nc >= 8) {
+      vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567));
+      vst1_s8(c0 + 0, vget_low_s8(vout0x01234567_1x01234567));
+
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
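This new 2x8 NEON kernel accumulates in int32 and then requantizes to int8 in the epilogue just shown: a saturating rounding doubling multiply-high by a Q31 multiplier (vqrdmulhq_s32), a sign correction plus rounding right shift (vsraq_n_s32 / vrshlq_s32), a saturating narrow with output-zero-point addition, and a final min/max clamp. The scalar sketch below models that fixed-point pipeline for a single accumulator; it is an approximation for illustration only (saturation of the Q31 multiply and the exact tie-breaking of the vector rounding are not reproduced bit-for-bit).

    #include <stdint.h>

    // Approximate scalar model of the NEON requantization epilogue above.
    static inline int8_t example_requantize(int32_t acc, int32_t multiplier,
                                            int32_t right_shift, int16_t zero_point,
                                            int8_t qmin, int8_t qmax) {
      // Q31 rounding multiply-high (vqrdmulhq_s32), saturation omitted.
      const int64_t prod = (int64_t) acc * (int64_t) multiplier;
      int32_t scaled = (int32_t) ((prod + (INT64_C(1) << 30)) >> 31);
      // Rounding right shift by a runtime amount (vrshlq_s32 with a negative shift).
      if (right_shift > 0) {
        const int64_t rounding = INT64_C(1) << (right_shift - 1);
        scaled = (int32_t) (((int64_t) scaled + rounding) >> right_shift);
      }
      // Add the output zero point and clamp to [qmin, qmax].
      int32_t out = scaled + (int32_t) zero_point;
      if (out < (int32_t) qmin) out = (int32_t) qmin;
      if (out > (int32_t) qmax) out = (int32_t) qmax;
      return (int8_t) out;
    }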
diff --git a/src/qs8-igemm/gen/2x8c16-minmax-neon-mlal-padal.c b/src/qs8-igemm/gen/2x8c16-minmax-neon-mlal-padal.c
new file mode 100644
index 0000000..bc9ba0e
--- /dev/null
+++ b/src/qs8-igemm/gen/2x8c16-minmax-neon-mlal-padal.c
@@ -0,0 +1,259 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/c16-neon-mlal-padal.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_2x8c16__neon_mlal_padal(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (2 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 16);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    c1 = c0;
+  }
+
+  do {
+    int32x4_t vacc0x0 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x1 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x2 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x3 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x4 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x5 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x6 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x7 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc1x0 = vacc0x0;
+    int32x4_t vacc1x1 = vacc0x1;
+    int32x4_t vacc1x2 = vacc0x2;
+    int32x4_t vacc1x3 = vacc0x3;
+    int32x4_t vacc1x4 = vacc0x4;
+    int32x4_t vacc1x5 = vacc0x5;
+    int32x4_t vacc1x6 = vacc0x6;
+    int32x4_t vacc1x7 = vacc0x7;
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      a += 2;
+
+      // KC loop of 16 with up to 15 remainder
+      size_t k = 0;
+      while (k < kc) {
+        const int8x16_t va0 = vld1q_s8(a0); a0 += 16;
+        const int8x16_t va1 = vld1q_s8(a1); a1 += 16;
+
+        const int8x16_t vb0 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb1 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb2 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb3 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb4 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb5 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb6 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb7 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+
+        int16x8_t vprod0x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va0));
+        int16x8_t vprod1x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va1));
+        vprod0x0 = vmlal_s8(vprod0x0, vget_high_s8(vb0), vget_high_s8(va0));
+        vprod1x0 = vmlal_s8(vprod1x0, vget_high_s8(vb0), vget_high_s8(va1));
+        vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+        vacc1x0 = vpadalq_s16(vacc1x0, vprod1x0);
+        int16x8_t vprod0x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va0));
+        int16x8_t vprod1x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va1));
+        vprod0x1 = vmlal_s8(vprod0x1, vget_high_s8(vb1), vget_high_s8(va0));
+        vprod1x1 = vmlal_s8(vprod1x1, vget_high_s8(vb1), vget_high_s8(va1));
+        vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+        vacc1x1 = vpadalq_s16(vacc1x1, vprod1x1);
+        int16x8_t vprod0x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va0));
+        int16x8_t vprod1x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va1));
+        vprod0x2 = vmlal_s8(vprod0x2, vget_high_s8(vb2), vget_high_s8(va0));
+        vprod1x2 = vmlal_s8(vprod1x2, vget_high_s8(vb2), vget_high_s8(va1));
+        vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+        vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);
+        int16x8_t vprod0x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va0));
+        int16x8_t vprod1x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va1));
+        vprod0x3 = vmlal_s8(vprod0x3, vget_high_s8(vb3), vget_high_s8(va0));
+        vprod1x3 = vmlal_s8(vprod1x3, vget_high_s8(vb3), vget_high_s8(va1));
+        vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+        vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);
+        int16x8_t vprod0x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va0));
+        int16x8_t vprod1x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va1));
+        vprod0x4 = vmlal_s8(vprod0x4, vget_high_s8(vb4), vget_high_s8(va0));
+        vprod1x4 = vmlal_s8(vprod1x4, vget_high_s8(vb4), vget_high_s8(va1));
+        vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+        vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);
+        int16x8_t vprod0x5 = vmull_s8(vget_low_s8(vb5), vget_low_s8(va0));
+        int16x8_t vprod1x5 = vmull_s8(vget_low_s8(vb5), vget_low_s8(va1));
+        vprod0x5 = vmlal_s8(vprod0x5, vget_high_s8(vb5), vget_high_s8(va0));
+        vprod1x5 = vmlal_s8(vprod1x5, vget_high_s8(vb5), vget_high_s8(va1));
+        vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+        vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);
+        int16x8_t vprod0x6 = vmull_s8(vget_low_s8(vb6), vget_low_s8(va0));
+        int16x8_t vprod1x6 = vmull_s8(vget_low_s8(vb6), vget_low_s8(va1));
+        vprod0x6 = vmlal_s8(vprod0x6, vget_high_s8(vb6), vget_high_s8(va0));
+        vprod1x6 = vmlal_s8(vprod1x6, vget_high_s8(vb6), vget_high_s8(va1));
+        vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+        vacc1x6 = vpadalq_s16(vacc1x6, vprod1x6);
+        int16x8_t vprod0x7 = vmull_s8(vget_low_s8(vb7), vget_low_s8(va0));
+        int16x8_t vprod1x7 = vmull_s8(vget_low_s8(vb7), vget_low_s8(va1));
+        vprod0x7 = vmlal_s8(vprod0x7, vget_high_s8(vb7), vget_high_s8(va0));
+        vprod1x7 = vmlal_s8(vprod1x7, vget_high_s8(vb7), vget_high_s8(va1));
+        vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+        vacc1x7 = vpadalq_s16(vacc1x7, vprod1x7);
+
+        k += 16 * sizeof(int8_t);
+      }
+
+      p -= 2 * sizeof(void*);
+    } while (p != 0);
+
+#if XNN_ARCH_ARM64
+    const int32x4_t vsum0x01 = vpaddq_s32(vacc0x0, vacc0x1);
+    const int32x4_t vsum0x23 = vpaddq_s32(vacc0x2, vacc0x3);
+    const int32x4_t vsum0x45 = vpaddq_s32(vacc0x4, vacc0x5);
+    const int32x4_t vsum0x67 = vpaddq_s32(vacc0x6, vacc0x7);
+    const int32x4_t vsum1x01 = vpaddq_s32(vacc1x0, vacc1x1);
+    const int32x4_t vsum1x23 = vpaddq_s32(vacc1x2, vacc1x3);
+    const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);
+    const int32x4_t vsum1x67 = vpaddq_s32(vacc1x6, vacc1x7);
+    int32x4_t vacc0x0123 = vpaddq_s32(vsum0x01, vsum0x23);
+    int32x4_t vacc0x4567 = vpaddq_s32(vsum0x45, vsum0x67);
+    int32x4_t vacc1x0123 = vpaddq_s32(vsum1x01, vsum1x23);
+    int32x4_t vacc1x4567 = vpaddq_s32(vsum1x45, vsum1x67);
+#else
+    const int32x2_t vpsum0x0 = vadd_s32(vget_low_s32(vacc0x0), vget_high_s32(vacc0x0));
+    const int32x2_t vpsum0x1 = vadd_s32(vget_low_s32(vacc0x1), vget_high_s32(vacc0x1));
+    const int32x2_t vpsum0x2 = vadd_s32(vget_low_s32(vacc0x2), vget_high_s32(vacc0x2));
+    const int32x2_t vpsum0x3 = vadd_s32(vget_low_s32(vacc0x3), vget_high_s32(vacc0x3));
+    const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1);
+    const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3);
+    int32x4_t vacc0x0123 = vcombine_s32(vsum0x01, vsum0x23 );
+    const int32x2_t vpsum0x4 = vadd_s32(vget_low_s32(vacc0x4), vget_high_s32(vacc0x4));
+    const int32x2_t vpsum0x5 = vadd_s32(vget_low_s32(vacc0x5), vget_high_s32(vacc0x5));
+    const int32x2_t vpsum0x6 = vadd_s32(vget_low_s32(vacc0x6), vget_high_s32(vacc0x6));
+    const int32x2_t vpsum0x7 = vadd_s32(vget_low_s32(vacc0x7), vget_high_s32(vacc0x7));
+    const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5);
+    const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7);
+    int32x4_t vacc0x4567 = vcombine_s32(vsum0x45, vsum0x67 );
+    const int32x2_t vpsum1x0 = vadd_s32(vget_low_s32(vacc1x0), vget_high_s32(vacc1x0));
+    const int32x2_t vpsum1x1 = vadd_s32(vget_low_s32(vacc1x1), vget_high_s32(vacc1x1));
+    const int32x2_t vpsum1x2 = vadd_s32(vget_low_s32(vacc1x2), vget_high_s32(vacc1x2));
+    const int32x2_t vpsum1x3 = vadd_s32(vget_low_s32(vacc1x3), vget_high_s32(vacc1x3));
+    const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1);
+    const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3);
+    int32x4_t vacc1x0123 = vcombine_s32(vsum1x01, vsum1x23 );
+    const int32x2_t vpsum1x4 = vadd_s32(vget_low_s32(vacc1x4), vget_high_s32(vacc1x4));
+    const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));
+    const int32x2_t vpsum1x6 = vadd_s32(vget_low_s32(vacc1x6), vget_high_s32(vacc1x6));
+    const int32x2_t vpsum1x7 = vadd_s32(vget_low_s32(vacc1x7), vget_high_s32(vacc1x7));
+    const int32x2_t vsum1x45 = vpadd_s32(vpsum1x4, vpsum1x5);
+    const int32x2_t vsum1x67 = vpadd_s32(vpsum1x6, vpsum1x7);
+    int32x4_t vacc1x4567 = vcombine_s32(vsum1x45, vsum1x67 );
+#endif
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min);
+
+    vout0x01234567_1x01234567 = vminq_s8(vout0x01234567_1x01234567, voutput_max);
+
+    if (nc >= 8) {
+      vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567));
+      vst1_s8(c0 + 0, vget_low_s8(vout0x01234567_1x01234567));
+
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
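In this 2x8c16 variant each of the 8 output columns owns its own int32x4 accumulator: every 16-byte K group is multiplied column-by-column with vmull_s8/vmlal_s8 and folded in with vpadalq_s16, and only after the K and KS loops are the four lanes of each accumulator reduced horizontally (vpaddq_s32 on AArch64, vadd_s32/vpadd_s32 otherwise) into vacc{0,1}x0123 and vacc{0,1}x4567. Because kc is rounded up to 16 and the packed weights are zero-padded to that length, any activation bytes read in the padded tail are multiplied by zero and do not change the result. A hedged scalar reference for a single output element, under that zero-padding assumption:

    #include <stdint.h>
    #include <stddef.h>

    // Illustrative scalar reference for one accumulator of the 2x8c16 kernel.
    // kc_rounded is kc rounded up to 16; b points at one zero-padded weight column.
    static int32_t example_c16_dot(const int8_t* a, const int8_t* b,
                                   size_t kc_rounded, int32_t bias) {
      int32_t acc = bias;  // the packed weights carry one int32 bias per column
      for (size_t k = 0; k < kc_rounded; k++) {
        acc += (int32_t) a[k] * (int32_t) b[k];
      }
      return acc;
    }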
diff --git a/src/qs8-igemm/gen/2x8c2-minmax-neon-mlal-padal-dup.c b/src/qs8-igemm/gen/2x8c2-minmax-neon-mlal-padal-dup.c
new file mode 100644
index 0000000..f48331d
--- /dev/null
+++ b/src/qs8-igemm/gen/2x8c2-minmax-neon-mlal-padal-dup.c
@@ -0,0 +1,307 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/c2-neon-mull-padal-dup.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (2 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    c1 = c0;
+  }
+
+  do {
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc1x0123 = vacc0x0123;
+    int32x4_t vacc1x4567 = vacc0x4567;
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      a += 2;
+
+      size_t k = kc;
+
+      while (k >= 16 * sizeof(int8_t)) {
+        const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va0x1 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va1x0 = vld1_s8(a1); a1 += 8;
+        const int8x8_t va1x1 = vld1_s8(a1); a1 += 8;
+
+        const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+        int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0)));
+        const int8x8_t vb0123c0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x0123c0 = vmlal_s8(vprod0x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+        vprod1x0123c0 = vmlal_s8(vprod1x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+        int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+        int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0)));
+        const int8x8_t vb4567c0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x4567c0 = vmlal_s8(vprod0x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+        vprod1x4567c0 = vmlal_s8(vprod1x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+        int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+        int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 1)));
+        const int8x8_t vb0123c1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x0123c1 = vmlal_s8(vprod0x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+        vprod1x0123c1 = vmlal_s8(vprod1x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+        int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+        int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 1)));
+        const int8x8_t vb4567c1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x4567c1 = vmlal_s8(vprod0x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+        vprod1x4567c1 = vmlal_s8(vprod1x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+        int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+        int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 2)));
+        const int8x8_t vb0123c2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x0123c2 = vmlal_s8(vprod0x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+        vprod1x0123c2 = vmlal_s8(vprod1x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+        int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+        int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 2)));
+        const int8x8_t vb4567c2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x4567c2 = vmlal_s8(vprod0x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+        vprod1x4567c2 = vmlal_s8(vprod1x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+        int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+        int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 3)));
+        const int8x8_t vb0123c3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x0123c3 = vmlal_s8(vprod0x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+        vprod1x0123c3 = vmlal_s8(vprod1x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+        int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+        int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 3)));
+        const int8x8_t vb4567c3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x4567c3 = vmlal_s8(vprod0x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+        vprod1x4567c3 = vmlal_s8(vprod1x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+
+        k -= 16 * sizeof(int8_t);
+      }
+
+      if (k >= 8 * sizeof(int8_t)) {
+        const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+
+        const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+        const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+        const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+        const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+        const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+        const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+
+        k -= 8 * sizeof(int8_t);
+      }
+
+      if XNN_UNLIKELY(k != 0) {
+        const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+        const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
+
+        const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+        const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+        const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+        const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+
+        if (k > 2 * sizeof(int8_t)) {
+          const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+          const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+          const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+          vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+          const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+          vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+
+          if (k > 4 * sizeof(int8_t)) {
+            const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+            const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+            const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+            const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+            vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+            const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+            vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+          }
+        }
+      }
+      p -= 2 * sizeof(void*);
+    } while (p != 0);
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min);
+
+    vout0x01234567_1x01234567 = vminq_s8(vout0x01234567_1x01234567, voutput_max);
+
+    if (nc >= 8) {
+      vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567));
+      vst1_s8(c0 + 0, vget_low_s8(vout0x01234567_1x01234567));
+
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
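The 2x8c2 kernels broadcast the activations two K values at a time: the int8x8 register holding a row slice is reinterpreted as int16x4 and vdup_lane_s16 replicates one 16-bit lane, i.e. one pair of consecutive int8 values, across the vector before vmull_s8/vmlal_s8; vpadalq_s16 then folds the two int16 products per column into the int32 accumulators. Below is a small scalar model of one such c2 step; the [8][2] weight layout is only a conceptual view of the packed data, not the literal buffer format.

    #include <stdint.h>

    // Scalar model of one c2 step: one broadcast activation pair against the
    // matching pair of each of 8 packed weight columns.
    static void example_c2_step(int32_t acc[8], const int8_t a_pair[2],
                                const int8_t b_pairs[8][2]) {
      for (int n = 0; n < 8; n++) {
        acc[n] += (int32_t) a_pair[0] * (int32_t) b_pairs[n][0]
                + (int32_t) a_pair[1] * (int32_t) b_pairs[n][1];
      }
    }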
diff --git a/src/qs8-igemm/gen/2x8c2-minmax-neon-mull-padal-dup.c b/src/qs8-igemm/gen/2x8c2-minmax-neon-mull-padal-dup.c
new file mode 100644
index 0000000..5178132
--- /dev/null
+++ b/src/qs8-igemm/gen/2x8c2-minmax-neon-mull-padal-dup.c
@@ -0,0 +1,233 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/c2-neon-mull-padal-dup.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (2 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    c1 = c0;
+  }
+
+  do {
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc1x0123 = vacc0x0123;
+    int32x4_t vacc1x4567 = vacc0x4567;
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      a += 2;
+
+      size_t k = kc;
+
+
+      while (k >= 8 * sizeof(int8_t)) {
+        const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+
+        const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+        const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+        const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+        const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+        const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+        const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+
+        k -= 8 * sizeof(int8_t);
+      }
+
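+      // Handle the leftover K values two at a time: c0 always, c1 if more than 2 remain, c2 if more than 4 remain.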
+      if XNN_UNLIKELY(k != 0) {
+        const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+        const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
+
+        const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+        const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+        const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+        const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+
+        if (k > 2 * sizeof(int8_t)) {
+          const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+          const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+          const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+          vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+          const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+          vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+
+          if (k > 4 * sizeof(int8_t)) {
+            const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+            const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+            const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+            const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+            vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+            const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+            vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+          }
+        }
+      }
+      p -= 2 * sizeof(void*);
+    } while (p != 0);
+
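+    // Requantize the int32 accumulators: saturating doubling-high multiply by the Q31 multiplier,
+    // adjust negative values, then apply a rounding right shift.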
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+
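+    // Add the output zero point while narrowing to int16, then saturate to int8 and clamp to the output range.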
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min);
+
+    vout0x01234567_1x01234567 = vminq_s8(vout0x01234567_1x01234567, voutput_max);
+
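+    // Write 8 columns per row when nc allows; otherwise store the nc remainder in 4-, 2- and 1-byte pieces.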
+    if (nc >= 8) {
+      vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567));
+      vst1_s8(c0 + 0, vget_low_s8(vout0x01234567_1x01234567));
+
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/2x8c8-minmax-avx2.c b/src/qs8-igemm/gen/2x8c8-minmax-avx2.c
index d3060c3..9291845 100644
--- a/src/qs8-igemm/gen/2x8c8-minmax-avx2.c
+++ b/src/qs8-igemm/gen/2x8c8-minmax-avx2.c
@@ -13,6 +13,7 @@
 
 #include <xnnpack/igemm.h>
 #include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2(
@@ -40,6 +41,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr != 2) {
diff --git a/src/qs8-igemm/gen/2x8c8-minmax-neon-mlal-padal.c b/src/qs8-igemm/gen/2x8c8-minmax-neon-mlal-padal.c
new file mode 100644
index 0000000..ae55019
--- /dev/null
+++ b/src/qs8-igemm/gen/2x8c8-minmax-neon-mlal-padal.c
@@ -0,0 +1,318 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/c8-neon-mull-padal.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (2 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    c1 = c0;
+  }
+
+  do {
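+    // Initialize one int32 accumulator per output channel from the per-channel bias in the packed weights;
+    // row 1 starts from the same values.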
+    int32x4_t vacc0x0 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x1 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x2 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x3 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x4 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x5 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x6 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x7 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc1x0 = vacc0x0;
+    int32x4_t vacc1x1 = vacc0x1;
+    int32x4_t vacc1x2 = vacc0x2;
+    int32x4_t vacc1x3 = vacc0x3;
+    int32x4_t vacc1x4 = vacc0x4;
+    int32x4_t vacc1x5 = vacc0x5;
+    int32x4_t vacc1x6 = vacc0x6;
+    int32x4_t vacc1x7 = vacc0x7;
+
+    size_t p = ks;
+    do {
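+      // a is the indirection buffer: entries that point at the zero buffer are left as-is,
+      // all others are advanced by a_offset.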
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      a += 2;
+
+      size_t k = kc;
+      // 2x partial unrolled loop to load 16 bytes at a time using MLA.
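+      // vmull_s8 forms 16-bit products for the first 8-byte slice, vmlal_s8 folds in the second slice,
+      // and vpadalq_s16 widens the sums into the 32-bit accumulators.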
+      while (k >= 16 * sizeof(int8_t)) {
+        const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va0x1 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va1x0 = vld1_s8(a1); a1 += 8;
+        const int8x8_t va1x1 = vld1_s8(a1); a1 += 8;
+
+        const int8x8_t vb0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb5x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb6x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb7x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int8x8_t vb0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        int16x8_t vprod0x0 = vmull_s8(vb0x0, va0x0);
+        int16x8_t vprod1x0 = vmull_s8(vb0x0, va1x0);
+        vprod0x0 = vmlal_s8(vprod0x0, vb0x1, va0x1);
+        vprod1x0 = vmlal_s8(vprod1x0, vb0x1, va1x1);
+        vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+        vacc1x0 = vpadalq_s16(vacc1x0, vprod1x0);
+        const int8x8_t vb1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        int16x8_t vprod0x1 = vmull_s8(vb1x0, va0x0);
+        int16x8_t vprod1x1 = vmull_s8(vb1x0, va1x0);
+        vprod0x1 = vmlal_s8(vprod0x1, vb1x1, va0x1);
+        vprod1x1 = vmlal_s8(vprod1x1, vb1x1, va1x1);
+        vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+        vacc1x1 = vpadalq_s16(vacc1x1, vprod1x1);
+        const int8x8_t vb2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        int16x8_t vprod0x2 = vmull_s8(vb2x0, va0x0);
+        int16x8_t vprod1x2 = vmull_s8(vb2x0, va1x0);
+        vprod0x2 = vmlal_s8(vprod0x2, vb2x1, va0x1);
+        vprod1x2 = vmlal_s8(vprod1x2, vb2x1, va1x1);
+        vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+        vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);
+        const int8x8_t vb3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        int16x8_t vprod0x3 = vmull_s8(vb3x0, va0x0);
+        int16x8_t vprod1x3 = vmull_s8(vb3x0, va1x0);
+        vprod0x3 = vmlal_s8(vprod0x3, vb3x1, va0x1);
+        vprod1x3 = vmlal_s8(vprod1x3, vb3x1, va1x1);
+        vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+        vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);
+        const int8x8_t vb4x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        int16x8_t vprod0x4 = vmull_s8(vb4x0, va0x0);
+        int16x8_t vprod1x4 = vmull_s8(vb4x0, va1x0);
+        vprod0x4 = vmlal_s8(vprod0x4, vb4x1, va0x1);
+        vprod1x4 = vmlal_s8(vprod1x4, vb4x1, va1x1);
+        vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+        vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);
+        const int8x8_t vb5x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        int16x8_t vprod0x5 = vmull_s8(vb5x0, va0x0);
+        int16x8_t vprod1x5 = vmull_s8(vb5x0, va1x0);
+        vprod0x5 = vmlal_s8(vprod0x5, vb5x1, va0x1);
+        vprod1x5 = vmlal_s8(vprod1x5, vb5x1, va1x1);
+        vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+        vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);
+        const int8x8_t vb6x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        int16x8_t vprod0x6 = vmull_s8(vb6x0, va0x0);
+        int16x8_t vprod1x6 = vmull_s8(vb6x0, va1x0);
+        vprod0x6 = vmlal_s8(vprod0x6, vb6x1, va0x1);
+        vprod1x6 = vmlal_s8(vprod1x6, vb6x1, va1x1);
+        vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+        vacc1x6 = vpadalq_s16(vacc1x6, vprod1x6);
+        const int8x8_t vb7x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        int16x8_t vprod0x7 = vmull_s8(vb7x0, va0x0);
+        int16x8_t vprod1x7 = vmull_s8(vb7x0, va1x0);
+        vprod0x7 = vmlal_s8(vprod0x7, vb7x1, va0x1);
+        vprod1x7 = vmlal_s8(vprod1x7, vb7x1, va1x1);
+        vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+        vacc1x7 = vpadalq_s16(vacc1x7, vprod1x7);
+
+        k -= 16 * sizeof(int8_t);
+      }
+
+      // Handle 8 bytes at a time using MUL.
+      if (k > 0) {
+        const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+
+        const int8x8_t vb0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x0 = vmull_s8(vb0, va0);
+        const int16x8_t vprod1x0 = vmull_s8(vb0, va1);
+        vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+        vacc1x0 = vpadalq_s16(vacc1x0, vprod1x0);
+        const int8x8_t vb1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x1 = vmull_s8(vb1, va0);
+        const int16x8_t vprod1x1 = vmull_s8(vb1, va1);
+        vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+        vacc1x1 = vpadalq_s16(vacc1x1, vprod1x1);
+        const int8x8_t vb2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x2 = vmull_s8(vb2, va0);
+        const int16x8_t vprod1x2 = vmull_s8(vb2, va1);
+        vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+        vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);
+        const int8x8_t vb3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x3 = vmull_s8(vb3, va0);
+        const int16x8_t vprod1x3 = vmull_s8(vb3, va1);
+        vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+        vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);
+        const int8x8_t vb4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x4 = vmull_s8(vb4, va0);
+        const int16x8_t vprod1x4 = vmull_s8(vb4, va1);
+        vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+        vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);
+        const int8x8_t vb5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x5 = vmull_s8(vb5, va0);
+        const int16x8_t vprod1x5 = vmull_s8(vb5, va1);
+        vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+        vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);
+        const int8x8_t vb6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x6 = vmull_s8(vb6, va0);
+        const int16x8_t vprod1x6 = vmull_s8(vb6, va1);
+        vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+        vacc1x6 = vpadalq_s16(vacc1x6, vprod1x6);
+        const int8x8_t vb7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x7 = vmull_s8(vb7, va0);
+        const int16x8_t vprod1x7 = vmull_s8(vb7, va1);
+        vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+        vacc1x7 = vpadalq_s16(vacc1x7, vprod1x7);
+
+        k -= 8 * sizeof(int8_t);
+      }
+
+      p -= 2 * sizeof(void*);
+    } while (p != 0);
+
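+    // Reduce the 8 per-channel accumulators of each row to two int32x4 vectors;
+    // AArch64 can use vpaddq_s32, AArch32 uses pairwise adds on the 64-bit halves.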
+#if XNN_ARCH_ARM64
+    const int32x4_t vsum0x01 = vpaddq_s32(vacc0x0, vacc0x1);
+    const int32x4_t vsum0x23 = vpaddq_s32(vacc0x2, vacc0x3);
+    const int32x4_t vsum0x45 = vpaddq_s32(vacc0x4, vacc0x5);
+    const int32x4_t vsum0x67 = vpaddq_s32(vacc0x6, vacc0x7);
+    const int32x4_t vsum1x01 = vpaddq_s32(vacc1x0, vacc1x1);
+    const int32x4_t vsum1x23 = vpaddq_s32(vacc1x2, vacc1x3);
+    const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);
+    const int32x4_t vsum1x67 = vpaddq_s32(vacc1x6, vacc1x7);
+    int32x4_t vacc0x0123 = vpaddq_s32(vsum0x01, vsum0x23);
+    int32x4_t vacc0x4567 = vpaddq_s32(vsum0x45, vsum0x67);
+    int32x4_t vacc1x0123 = vpaddq_s32(vsum1x01, vsum1x23);
+    int32x4_t vacc1x4567 = vpaddq_s32(vsum1x45, vsum1x67);
+#else
+    const int32x2_t vpsum0x0 = vadd_s32(vget_low_s32(vacc0x0), vget_high_s32(vacc0x0));
+    const int32x2_t vpsum0x1 = vadd_s32(vget_low_s32(vacc0x1), vget_high_s32(vacc0x1));
+    const int32x2_t vpsum0x2 = vadd_s32(vget_low_s32(vacc0x2), vget_high_s32(vacc0x2));
+    const int32x2_t vpsum0x3 = vadd_s32(vget_low_s32(vacc0x3), vget_high_s32(vacc0x3));
+    const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1);
+    const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3);
+    int32x4_t vacc0x0123 = vcombine_s32(vsum0x01, vsum0x23 );
+    const int32x2_t vpsum0x4 = vadd_s32(vget_low_s32(vacc0x4), vget_high_s32(vacc0x4));
+    const int32x2_t vpsum0x5 = vadd_s32(vget_low_s32(vacc0x5), vget_high_s32(vacc0x5));
+    const int32x2_t vpsum0x6 = vadd_s32(vget_low_s32(vacc0x6), vget_high_s32(vacc0x6));
+    const int32x2_t vpsum0x7 = vadd_s32(vget_low_s32(vacc0x7), vget_high_s32(vacc0x7));
+    const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5);
+    const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7);
+    int32x4_t vacc0x4567 = vcombine_s32(vsum0x45, vsum0x67 );
+    const int32x2_t vpsum1x0 = vadd_s32(vget_low_s32(vacc1x0), vget_high_s32(vacc1x0));
+    const int32x2_t vpsum1x1 = vadd_s32(vget_low_s32(vacc1x1), vget_high_s32(vacc1x1));
+    const int32x2_t vpsum1x2 = vadd_s32(vget_low_s32(vacc1x2), vget_high_s32(vacc1x2));
+    const int32x2_t vpsum1x3 = vadd_s32(vget_low_s32(vacc1x3), vget_high_s32(vacc1x3));
+    const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1);
+    const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3);
+    int32x4_t vacc1x0123 = vcombine_s32(vsum1x01, vsum1x23 );
+    const int32x2_t vpsum1x4 = vadd_s32(vget_low_s32(vacc1x4), vget_high_s32(vacc1x4));
+    const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));
+    const int32x2_t vpsum1x6 = vadd_s32(vget_low_s32(vacc1x6), vget_high_s32(vacc1x6));
+    const int32x2_t vpsum1x7 = vadd_s32(vget_low_s32(vacc1x7), vget_high_s32(vacc1x7));
+    const int32x2_t vsum1x45 = vpadd_s32(vpsum1x4, vpsum1x5);
+    const int32x2_t vsum1x67 = vpadd_s32(vpsum1x6, vpsum1x7);
+    int32x4_t vacc1x4567 = vcombine_s32(vsum1x45, vsum1x67 );
+#endif
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min);
+
+    vout0x01234567_1x01234567 = vminq_s8(vout0x01234567_1x01234567, voutput_max);
+
+    if (nc >= 8) {
+      vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567));
+      vst1_s8(c0 + 0, vget_low_s8(vout0x01234567_1x01234567));
+
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/2x8c8-minmax-neon-mull-padal.c b/src/qs8-igemm/gen/2x8c8-minmax-neon-mull-padal.c
new file mode 100644
index 0000000..03bbff3
--- /dev/null
+++ b/src/qs8-igemm/gen/2x8c8-minmax-neon-mull-padal.c
@@ -0,0 +1,243 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/c8-neon-mull-padal.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mull_padal(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (2 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    c1 = c0;
+  }
+
+  do {
+    int32x4_t vacc0x0 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x1 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x2 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x3 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x4 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x5 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x6 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x7 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc1x0 = vacc0x0;
+    int32x4_t vacc1x1 = vacc0x1;
+    int32x4_t vacc1x2 = vacc0x2;
+    int32x4_t vacc1x3 = vacc0x3;
+    int32x4_t vacc1x4 = vacc0x4;
+    int32x4_t vacc1x5 = vacc0x5;
+    int32x4_t vacc1x6 = vacc0x6;
+    int32x4_t vacc1x7 = vacc0x7;
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      a += 2;
+
+      size_t k = kc;
+
+      // Handle 8 bytes at a time using MUL.
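+      // kc was rounded up to a multiple of 8 above, so each iteration can load a full 8-byte slice of A
+      // and of every packed B column.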
+      while (k > 0) {
+        const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+
+        const int8x8_t vb0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x0 = vmull_s8(vb0, va0);
+        const int16x8_t vprod1x0 = vmull_s8(vb0, va1);
+        vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+        vacc1x0 = vpadalq_s16(vacc1x0, vprod1x0);
+        const int8x8_t vb1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x1 = vmull_s8(vb1, va0);
+        const int16x8_t vprod1x1 = vmull_s8(vb1, va1);
+        vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+        vacc1x1 = vpadalq_s16(vacc1x1, vprod1x1);
+        const int8x8_t vb2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x2 = vmull_s8(vb2, va0);
+        const int16x8_t vprod1x2 = vmull_s8(vb2, va1);
+        vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+        vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);
+        const int8x8_t vb3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x3 = vmull_s8(vb3, va0);
+        const int16x8_t vprod1x3 = vmull_s8(vb3, va1);
+        vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+        vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);
+        const int8x8_t vb4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x4 = vmull_s8(vb4, va0);
+        const int16x8_t vprod1x4 = vmull_s8(vb4, va1);
+        vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+        vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);
+        const int8x8_t vb5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x5 = vmull_s8(vb5, va0);
+        const int16x8_t vprod1x5 = vmull_s8(vb5, va1);
+        vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+        vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);
+        const int8x8_t vb6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x6 = vmull_s8(vb6, va0);
+        const int16x8_t vprod1x6 = vmull_s8(vb6, va1);
+        vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+        vacc1x6 = vpadalq_s16(vacc1x6, vprod1x6);
+        const int8x8_t vb7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x7 = vmull_s8(vb7, va0);
+        const int16x8_t vprod1x7 = vmull_s8(vb7, va1);
+        vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+        vacc1x7 = vpadalq_s16(vacc1x7, vprod1x7);
+
+        k -= 8 * sizeof(int8_t);
+      }
+
+      p -= 2 * sizeof(void*);
+    } while (p != 0);
+
+#if XNN_ARCH_ARM64
+    const int32x4_t vsum0x01 = vpaddq_s32(vacc0x0, vacc0x1);
+    const int32x4_t vsum0x23 = vpaddq_s32(vacc0x2, vacc0x3);
+    const int32x4_t vsum0x45 = vpaddq_s32(vacc0x4, vacc0x5);
+    const int32x4_t vsum0x67 = vpaddq_s32(vacc0x6, vacc0x7);
+    const int32x4_t vsum1x01 = vpaddq_s32(vacc1x0, vacc1x1);
+    const int32x4_t vsum1x23 = vpaddq_s32(vacc1x2, vacc1x3);
+    const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);
+    const int32x4_t vsum1x67 = vpaddq_s32(vacc1x6, vacc1x7);
+    int32x4_t vacc0x0123 = vpaddq_s32(vsum0x01, vsum0x23);
+    int32x4_t vacc0x4567 = vpaddq_s32(vsum0x45, vsum0x67);
+    int32x4_t vacc1x0123 = vpaddq_s32(vsum1x01, vsum1x23);
+    int32x4_t vacc1x4567 = vpaddq_s32(vsum1x45, vsum1x67);
+#else
+    const int32x2_t vpsum0x0 = vadd_s32(vget_low_s32(vacc0x0), vget_high_s32(vacc0x0));
+    const int32x2_t vpsum0x1 = vadd_s32(vget_low_s32(vacc0x1), vget_high_s32(vacc0x1));
+    const int32x2_t vpsum0x2 = vadd_s32(vget_low_s32(vacc0x2), vget_high_s32(vacc0x2));
+    const int32x2_t vpsum0x3 = vadd_s32(vget_low_s32(vacc0x3), vget_high_s32(vacc0x3));
+    const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1);
+    const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3);
+    int32x4_t vacc0x0123 = vcombine_s32(vsum0x01, vsum0x23 );
+    const int32x2_t vpsum0x4 = vadd_s32(vget_low_s32(vacc0x4), vget_high_s32(vacc0x4));
+    const int32x2_t vpsum0x5 = vadd_s32(vget_low_s32(vacc0x5), vget_high_s32(vacc0x5));
+    const int32x2_t vpsum0x6 = vadd_s32(vget_low_s32(vacc0x6), vget_high_s32(vacc0x6));
+    const int32x2_t vpsum0x7 = vadd_s32(vget_low_s32(vacc0x7), vget_high_s32(vacc0x7));
+    const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5);
+    const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7);
+    int32x4_t vacc0x4567 = vcombine_s32(vsum0x45, vsum0x67 );
+    const int32x2_t vpsum1x0 = vadd_s32(vget_low_s32(vacc1x0), vget_high_s32(vacc1x0));
+    const int32x2_t vpsum1x1 = vadd_s32(vget_low_s32(vacc1x1), vget_high_s32(vacc1x1));
+    const int32x2_t vpsum1x2 = vadd_s32(vget_low_s32(vacc1x2), vget_high_s32(vacc1x2));
+    const int32x2_t vpsum1x3 = vadd_s32(vget_low_s32(vacc1x3), vget_high_s32(vacc1x3));
+    const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1);
+    const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3);
+    int32x4_t vacc1x0123 = vcombine_s32(vsum1x01, vsum1x23 );
+    const int32x2_t vpsum1x4 = vadd_s32(vget_low_s32(vacc1x4), vget_high_s32(vacc1x4));
+    const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));
+    const int32x2_t vpsum1x6 = vadd_s32(vget_low_s32(vacc1x6), vget_high_s32(vacc1x6));
+    const int32x2_t vpsum1x7 = vadd_s32(vget_low_s32(vacc1x7), vget_high_s32(vacc1x7));
+    const int32x2_t vsum1x45 = vpadd_s32(vpsum1x4, vpsum1x5);
+    const int32x2_t vsum1x67 = vpadd_s32(vpsum1x6, vpsum1x7);
+    int32x4_t vacc1x4567 = vcombine_s32(vsum1x45, vsum1x67 );
+#endif
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min);
+
+    vout0x01234567_1x01234567 = vminq_s8(vout0x01234567_1x01234567, voutput_max);
+
+    if (nc >= 8) {
+      vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567));
+      vst1_s8(c0 + 0, vget_low_s8(vout0x01234567_1x01234567));
+
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/3x16-minmax-neon-mlal-lane.c b/src/qs8-igemm/gen/3x16-minmax-neon-mlal-lane.c
new file mode 100644
index 0000000..776e18f
--- /dev/null
+++ b/src/qs8-igemm/gen/3x16-minmax-neon-mlal-lane.c
@@ -0,0 +1,511 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/neon-mlal-lane.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/igemm.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (3 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
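+  // If mr < 3, the unused row pointers alias a lower row, so the kernel never writes out of bounds.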
+
+  do {
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc1x0123 = vacc0x0123;
+    int32x4_t vacc1x4567 = vacc0x4567;
+    int32x4_t vacc1x89AB = vacc0x89AB;
+    int32x4_t vacc1xCDEF = vacc0xCDEF;
+    int32x4_t vacc2x0123 = vacc0x0123;
+    int32x4_t vacc2x4567 = vacc0x4567;
+    int32x4_t vacc2x89AB = vacc0x89AB;
+    int32x4_t vacc2xCDEF = vacc0xCDEF;
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      const int8_t* restrict a2 = a[2];
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+      }
+      a += 3;
+
+      size_t k = kc;
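+      // Main loop: sign-extend 8 bytes of each A row to int16 and multiply-accumulate against
+      // 16 packed B columns with vmlal_lane_s16.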
+      while (k >= 8 * sizeof(int8_t)) {
+        const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+        const int16x8_t vxa0 = vmovl_s8(va0);
+        const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+        const int16x8_t vxa1 = vmovl_s8(va1);
+        const int8x8_t va2 = vld1_s8(a2); a2 += 8;
+        const int16x8_t vxa2 = vmovl_s8(va2);
+
+        const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb01234567c0 = vmovl_s8(vb01234567c0);
+
+        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+        vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
+        vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
+        vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
+        vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
+        const int8x8_t vb89ABCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb89ABCDEFc0 = vmovl_s8(vb89ABCDEFc0);
+
+        vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
+        vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
+        vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
+        vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
+        vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
+        vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
+        const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb01234567c1 = vmovl_s8(vb01234567c1);
+
+        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+        vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
+        vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
+        vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa2), 1);
+        vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa2), 1);
+        const int8x8_t vb89ABCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb89ABCDEFc1 = vmovl_s8(vb89ABCDEFc1);
+
+        vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1);
+        vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1);
+        vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1);
+        vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1);
+        vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1);
+        vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1);
+        const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb01234567c2 = vmovl_s8(vb01234567c2);
+
+        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+        vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
+        vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
+        vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa2), 2);
+        vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa2), 2);
+        const int8x8_t vb89ABCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb89ABCDEFc2 = vmovl_s8(vb89ABCDEFc2);
+
+        vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2);
+        vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2);
+        vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa1), 2);
+        vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa1), 2);
+        vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa2), 2);
+        vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa2), 2);
+        const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb01234567c3 = vmovl_s8(vb01234567c3);
+
+        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+        vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
+        vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
+        vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa2), 3);
+        vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa2), 3);
+        const int8x8_t vb89ABCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb89ABCDEFc3 = vmovl_s8(vb89ABCDEFc3);
+
+        vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3);
+        vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3);
+        vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa1), 3);
+        vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa1), 3);
+        vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa2), 3);
+        vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa2), 3);
+
+        const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb01234567c4 = vmovl_s8(vb01234567c4);
+
+        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+        vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
+        vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
+        vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa2), 0);
+        vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa2), 0);
+        const int8x8_t vb89ABCDEFc4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb89ABCDEFc4 = vmovl_s8(vb89ABCDEFc4);
+
+        vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0);
+        vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0);
+        vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa1), 0);
+        vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa1), 0);
+        vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa2), 0);
+        vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa2), 0);
+        const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb01234567c5 = vmovl_s8(vb01234567c5);
+
+        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+        vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
+        vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
+        vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa2), 1);
+        vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa2), 1);
+        const int8x8_t vb89ABCDEFc5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb89ABCDEFc5 = vmovl_s8(vb89ABCDEFc5);
+
+        vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1);
+        vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1);
+        vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa1), 1);
+        vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa1), 1);
+        vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa2), 1);
+        vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa2), 1);
+        const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb01234567c6 = vmovl_s8(vb01234567c6);
+
+        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+        vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
+        vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
+        vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa2), 2);
+        vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa2), 2);
+        const int8x8_t vb89ABCDEFc6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb89ABCDEFc6 = vmovl_s8(vb89ABCDEFc6);
+
+        vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2);
+        vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2);
+        vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa1), 2);
+        vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa1), 2);
+        vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa2), 2);
+        vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa2), 2);
+        const int8x8_t vb01234567c7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb01234567c7 = vmovl_s8(vb01234567c7);
+
+        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa0), 3);
+        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa0), 3);
+        vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa1), 3);
+        vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa1), 3);
+        vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa2), 3);
+        vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa2), 3);
+        const int8x8_t vb89ABCDEFc7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb89ABCDEFc7 = vmovl_s8(vb89ABCDEFc7);
+
+        vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa0), 3);
+        vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa0), 3);
+        vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa1), 3);
+        vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa1), 3);
+        vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa2), 3);
+        vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa2), 3);
+
+        k -= 8 * sizeof(int8_t);
+      }
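+      // Remainder: up to 7 leftover K values are processed one at a time;
+      // A lanes at or beyond k are loaded but never multiplied.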
+      if XNN_UNLIKELY(k != 0) {
+        const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+        const int16x8_t vxa0 = vmovl_s8(va0);
+        const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
+        const int16x8_t vxa1 = vmovl_s8(va1);
+        const int8x8_t va2 = vld1_s8(a2); a2 = (const int8_t*) ((uintptr_t) a2 + k);
+        const int16x8_t vxa2 = vmovl_s8(va2);
+
+        const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb01234567c0 = vmovl_s8(vb01234567c0);
+        const int8x8_t vb89ABCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb89ABCDEFc0 = vmovl_s8(vb89ABCDEFc0);
+
+        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+        vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
+        vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
+        vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
+        vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
+        vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
+        vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
+        vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
+        vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
+        vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
+        vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
+
+        if (k >= 2 * sizeof(int8_t)) {
+          const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int16x8_t vxb01234567c1 = vmovl_s8(vb01234567c1);
+          const int8x8_t vb89ABCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int16x8_t vxb89ABCDEFc1 = vmovl_s8(vb89ABCDEFc1);
+
+          vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+          vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+          vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1);
+          vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1);
+          vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
+          vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
+          vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1);
+          vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1);
+          vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa2), 1);
+          vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa2), 1);
+          vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1);
+          vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1);
+
+          if (k > 2 * sizeof(int8_t)) {
+            const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int16x8_t vxb01234567c2 = vmovl_s8(vb01234567c2);
+            const int8x8_t vb89ABCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int16x8_t vxb89ABCDEFc2 = vmovl_s8(vb89ABCDEFc2);
+
+            vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+            vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+            vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2);
+            vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2);
+            vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
+            vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
+            vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa1), 2);
+            vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa1), 2);
+            vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa2), 2);
+            vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa2), 2);
+            vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa2), 2);
+            vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa2), 2);
+
+            if (k >= 4 * sizeof(int8_t)) {
+              const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+              const int16x8_t vxb01234567c3 = vmovl_s8(vb01234567c3);
+              const int8x8_t vb89ABCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+              const int16x8_t vxb89ABCDEFc3 = vmovl_s8(vb89ABCDEFc3);
+
+              vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+              vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+              vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3);
+              vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3);
+              vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
+              vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
+              vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa1), 3);
+              vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa1), 3);
+              vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa2), 3);
+              vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa2), 3);
+              vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa2), 3);
+              vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa2), 3);
+
+              if (k > 4 * sizeof(int8_t)) {
+                const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+                const int16x8_t vxb01234567c4 = vmovl_s8(vb01234567c4);
+                const int8x8_t vb89ABCDEFc4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+                const int16x8_t vxb89ABCDEFc4 = vmovl_s8(vb89ABCDEFc4);
+
+                vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+                vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+                vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0);
+                vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0);
+                vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
+                vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
+                vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa1), 0);
+                vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa1), 0);
+                vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa2), 0);
+                vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa2), 0);
+                vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa2), 0);
+                vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa2), 0);
+
+                if (k >= 6 * sizeof(int8_t)) {
+                  const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+                  const int16x8_t vxb01234567c5 = vmovl_s8(vb01234567c5);
+                  const int8x8_t vb89ABCDEFc5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+                  const int16x8_t vxb89ABCDEFc5 = vmovl_s8(vb89ABCDEFc5);
+
+                  vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+                  vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+                  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1);
+                  vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1);
+                  vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
+                  vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
+                  vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa1), 1);
+                  vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa1), 1);
+                  vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa2), 1);
+                  vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa2), 1);
+                  vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa2), 1);
+                  vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa2), 1);
+
+                  if (k > 6 * sizeof(int8_t)) {
+                    const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+                    const int16x8_t vxb01234567c6 = vmovl_s8(vb01234567c6);
+                    const int8x8_t vb89ABCDEFc6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+                    const int16x8_t vxb89ABCDEFc6 = vmovl_s8(vb89ABCDEFc6);
+
+                    vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+                    vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+                    vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2);
+                    vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2);
+                    vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
+                    vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
+                    vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa1), 2);
+                    vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa1), 2);
+                    vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa2), 2);
+                    vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa2), 2);
+                    vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa2), 2);
+                    vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa2), 2);
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+      p -= 3 * sizeof(void*);
+    } while (p != 0);
+
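+    // Requantize the 32-bit accumulators: saturating rounding doubling multiply-high by the fixed-point multiplier.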
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+    vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc1x89AB = vqrdmulhq_s32(vacc1x89AB, vmultiplier);
+    vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier);
+    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+    vacc2x89AB = vqrdmulhq_s32(vacc2x89AB, vmultiplier);
+    vacc2xCDEF = vqrdmulhq_s32(vacc2xCDEF, vmultiplier);
+
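+    // Pre-bias negative accumulators by -1 (skipped when the shift amount is zero) so the rounding
+    // right shift below rounds ties away from zero.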
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+    vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc1x89AB = vsraq_n_s32(vacc1x89AB, vbicq_s32(vacc1x89AB, vzero_shift_mask), 31);
+    vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31);
+    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+    vacc2x89AB = vsraq_n_s32(vacc2x89AB, vbicq_s32(vacc2x89AB, vzero_shift_mask), 31);
+    vacc2xCDEF = vsraq_n_s32(vacc2xCDEF, vbicq_s32(vacc2xCDEF, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc1x89AB = vrshlq_s32(vacc1x89AB, vright_shift);
+    vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_shift);
+    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+    vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_shift);
+    vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_shift);
+
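+    // Saturating-narrow to 16 bits, add the output zero point, then saturating-narrow to 8 bits.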
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+    const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+    int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
+    int8x16_t vout2x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc2x89ABCDEF);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+    const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+    int8x16_t vout1x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc1x01234567), vqmovn_s16(vacc1x89ABCDEF));
+    int8x16_t vout2x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc2x89ABCDEF));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout2x0123456789ABCDEF = vmaxq_s8(vout2x0123456789ABCDEF, voutput_min);
+    vout1x0123456789ABCDEF = vmaxq_s8(vout1x0123456789ABCDEF, voutput_min);
+    vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+
+    vout2x0123456789ABCDEF = vminq_s8(vout2x0123456789ABCDEF, voutput_max);
+    vout1x0123456789ABCDEF = vminq_s8(vout1x0123456789ABCDEF, voutput_max);
+    vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+
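+    // Store a full 16-column tile when possible; otherwise fall through to the 8/4/2/1-column remainder stores.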
+    if (nc >= 16) {
+      vst1q_s8(c2 + 0, vout2x0123456789ABCDEF);
+      vst1q_s8(c1 + 0, vout1x0123456789ABCDEF);
+      vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 16;
+    } else {
+      int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8(vout1x0123456789ABCDEF));
+      int8x8_t vout2x01234567 = vget_low_s8(vout2x0123456789ABCDEF);
+      if (nc & 8) {
+        vst1_s8(c2, vout2x01234567); c2 += 8;
+        vst1_s8(c1, vget_high_s8(vout0x01234567_1x01234567)); c1 += 8;
+        vst1_s8(c0, vget_low_s8(vout0x01234567_1x01234567)); c0 += 8;
+        vout2x01234567 = vget_high_s8(vout2x0123456789ABCDEF);
+        vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0123456789ABCDEF));
+      }
+      if (nc & 4) {
+        vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1_lane_s8(c2, vout2x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/3x16-minmax-neon-mull-addw-dup.c b/src/qs8-igemm/gen/3x16-minmax-neon-mull-addw-dup.c
new file mode 100644
index 0000000..ddd6d7d
--- /dev/null
+++ b/src/qs8-igemm/gen/3x16-minmax-neon-mull-addw-dup.c
@@ -0,0 +1,564 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/neon-mull-addw-dup.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/gemm.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (3 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+
+  do {
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc1x0123 = vacc0x0123;
+    int32x4_t vacc1x4567 = vacc0x4567;
+    int32x4_t vacc1x89AB = vacc0x89AB;
+    int32x4_t vacc1xCDEF = vacc0xCDEF;
+    int32x4_t vacc2x0123 = vacc0x0123;
+    int32x4_t vacc2x4567 = vacc0x4567;
+    int32x4_t vacc2x89AB = vacc0x89AB;
+    int32x4_t vacc2xCDEF = vacc0xCDEF;
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      const int8_t* restrict a2 = a[2];
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+      }
+      a += 3;
+
+      size_t k = kc;
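+      // Main loop over k in steps of 8: each input byte is broadcast with vdup_lane_s8, multiplied
+      // against 16 filter bytes with vmull_s8, and widen-accumulated into the 32-bit accumulators.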
+      while (k >= 8 * sizeof(int8_t)) {
+        const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+        const int8x8_t va2 = vld1_s8(a2); a2 += 8;
+
+        const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va0, 0));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c0));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c0));
+        const int16x8_t vprod1x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va1, 0));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c0));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c0));
+        const int16x8_t vprod2x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va2, 0));
+        vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c0));
+        vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c0));
+        const int8x8_t vb89ABCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va0, 0));
+        vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc0));
+        vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc0));
+        const int16x8_t vprod1x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va1, 0));
+        vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc0));
+        vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc0));
+        const int16x8_t vprod2x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va2, 0));
+        vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc0));
+        vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc0));
+        const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va0, 1));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c1));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c1));
+        const int16x8_t vprod1x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va1, 1));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c1));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c1));
+        const int16x8_t vprod2x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va2, 1));
+        vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c1));
+        vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c1));
+        const int8x8_t vb89ABCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va0, 1));
+        vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc1));
+        vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc1));
+        const int16x8_t vprod1x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va1, 1));
+        vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc1));
+        vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc1));
+        const int16x8_t vprod2x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va2, 1));
+        vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc1));
+        vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc1));
+        const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va0, 2));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c2));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c2));
+        const int16x8_t vprod1x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va1, 2));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c2));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c2));
+        const int16x8_t vprod2x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va2, 2));
+        vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c2));
+        vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c2));
+        const int8x8_t vb89ABCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va0, 2));
+        vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc2));
+        vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc2));
+        const int16x8_t vprod1x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va1, 2));
+        vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc2));
+        vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc2));
+        const int16x8_t vprod2x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va2, 2));
+        vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc2));
+        vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc2));
+        const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));
+        const int16x8_t vprod1x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va1, 3));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c3));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c3));
+        const int16x8_t vprod2x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va2, 3));
+        vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c3));
+        vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c3));
+        const int8x8_t vb89ABCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va0, 3));
+        vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc3));
+        vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc3));
+        const int16x8_t vprod1x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va1, 3));
+        vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc3));
+        vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc3));
+        const int16x8_t vprod2x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va2, 3));
+        vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc3));
+        vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc3));
+        const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va0, 4));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c4));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c4));
+        const int16x8_t vprod1x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va1, 4));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c4));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c4));
+        const int16x8_t vprod2x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va2, 4));
+        vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c4));
+        vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c4));
+        const int8x8_t vb89ABCDEFc4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va0, 4));
+        vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc4));
+        vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc4));
+        const int16x8_t vprod1x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va1, 4));
+        vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc4));
+        vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc4));
+        const int16x8_t vprod2x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va2, 4));
+        vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc4));
+        vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc4));
+        const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va0, 5));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c5));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c5));
+        const int16x8_t vprod1x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va1, 5));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c5));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c5));
+        const int16x8_t vprod2x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va2, 5));
+        vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c5));
+        vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c5));
+        const int8x8_t vb89ABCDEFc5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va0, 5));
+        vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc5));
+        vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc5));
+        const int16x8_t vprod1x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va1, 5));
+        vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc5));
+        vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc5));
+        const int16x8_t vprod2x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va2, 5));
+        vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc5));
+        vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc5));
+        const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va0, 6));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c6));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c6));
+        const int16x8_t vprod1x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va1, 6));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c6));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c6));
+        const int16x8_t vprod2x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va2, 6));
+        vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c6));
+        vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c6));
+        const int8x8_t vb89ABCDEFc6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va0, 6));
+        vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc6));
+        vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc6));
+        const int16x8_t vprod1x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va1, 6));
+        vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc6));
+        vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc6));
+        const int16x8_t vprod2x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va2, 6));
+        vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc6));
+        vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc6));
+        const int8x8_t vb01234567c7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c7 = vmull_s8(vb01234567c7, vdup_lane_s8(va0, 7));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c7));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c7));
+        const int16x8_t vprod1x01234567c7 = vmull_s8(vb01234567c7, vdup_lane_s8(va1, 7));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c7));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c7));
+        const int16x8_t vprod2x01234567c7 = vmull_s8(vb01234567c7, vdup_lane_s8(va2, 7));
+        vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c7));
+        vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c7));
+        const int8x8_t vb89ABCDEFc7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x89ABCDEFc7 = vmull_s8(vb89ABCDEFc7, vdup_lane_s8(va0, 7));
+        vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc7));
+        vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc7));
+        const int16x8_t vprod1x89ABCDEFc7 = vmull_s8(vb89ABCDEFc7, vdup_lane_s8(va1, 7));
+        vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc7));
+        vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc7));
+        const int16x8_t vprod2x89ABCDEFc7 = vmull_s8(vb89ABCDEFc7, vdup_lane_s8(va2, 7));
+        vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc7));
+        vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc7));
+
+        k -= 8 * sizeof(int8_t);
+      }
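+      // Remainder of 1..7 k values: column 0 is always processed, columns 1..6 behind nested k checks.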
+      if XNN_UNLIKELY(k != 0) {
+        const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+        const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
+        const int8x8_t va2 = vld1_s8(a2); a2 = (const int8_t*) ((uintptr_t) a2 + k);
+
+        const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va0, 0));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c0));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c0));
+        const int16x8_t vprod0x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va0, 0));
+        vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc0));
+        vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc0));
+        const int16x8_t vprod1x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va1, 0));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c0));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c0));
+        const int16x8_t vprod1x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va1, 0));
+        vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc0));
+        vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc0));
+        const int16x8_t vprod2x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va2, 0));
+        vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c0));
+        vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c0));
+        const int16x8_t vprod2x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va2, 0));
+        vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc0));
+        vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc0));
+
+        if (k >= 2 * sizeof(int8_t)) {
+          const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb89ABCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          const int16x8_t vprod0x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va0, 1));
+          vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c1));
+          vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c1));
+          const int16x8_t vprod0x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va0, 1));
+          vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc1));
+          vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc1));
+          const int16x8_t vprod1x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va1, 1));
+          vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c1));
+          vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c1));
+          const int16x8_t vprod1x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va1, 1));
+          vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc1));
+          vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc1));
+          const int16x8_t vprod2x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va2, 1));
+          vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c1));
+          vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c1));
+          const int16x8_t vprod2x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va2, 1));
+          vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc1));
+          vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc1));
+
+          if (k > 2 * sizeof(int8_t)) {
+            const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int8x8_t vb89ABCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+            const int16x8_t vprod0x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va0, 2));
+            vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c2));
+            vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c2));
+            const int16x8_t vprod0x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va0, 2));
+            vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc2));
+            vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc2));
+            const int16x8_t vprod1x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va1, 2));
+            vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c2));
+            vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c2));
+            const int16x8_t vprod1x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va1, 2));
+            vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc2));
+            vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc2));
+            const int16x8_t vprod2x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va2, 2));
+            vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c2));
+            vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c2));
+            const int16x8_t vprod2x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va2, 2));
+            vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc2));
+            vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc2));
+
+            if (k >= 4 * sizeof(int8_t)) {
+              const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+              const int8x8_t vb89ABCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+              const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));
+              vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
+              vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));
+              const int16x8_t vprod0x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va0, 3));
+              vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc3));
+              vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc3));
+              const int16x8_t vprod1x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va1, 3));
+              vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c3));
+              vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c3));
+              const int16x8_t vprod1x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va1, 3));
+              vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc3));
+              vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc3));
+              const int16x8_t vprod2x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va2, 3));
+              vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c3));
+              vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c3));
+              const int16x8_t vprod2x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va2, 3));
+              vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc3));
+              vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc3));
+
+              if (k > 4 * sizeof(int8_t)) {
+                const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+                const int8x8_t vb89ABCDEFc4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+                const int16x8_t vprod0x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va0, 4));
+                vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c4));
+                vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c4));
+                const int16x8_t vprod0x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va0, 4));
+                vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc4));
+                vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc4));
+                const int16x8_t vprod1x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va1, 4));
+                vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c4));
+                vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c4));
+                const int16x8_t vprod1x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va1, 4));
+                vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc4));
+                vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc4));
+                const int16x8_t vprod2x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va2, 4));
+                vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c4));
+                vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c4));
+                const int16x8_t vprod2x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va2, 4));
+                vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc4));
+                vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc4));
+
+                if (k >= 6 * sizeof(int8_t)) {
+                  const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+                  const int8x8_t vb89ABCDEFc5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+                  const int16x8_t vprod0x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va0, 5));
+                  vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c5));
+                  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c5));
+                  const int16x8_t vprod0x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va0, 5));
+                  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc5));
+                  vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc5));
+                  const int16x8_t vprod1x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va1, 5));
+                  vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c5));
+                  vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c5));
+                  const int16x8_t vprod1x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va1, 5));
+                  vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc5));
+                  vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc5));
+                  const int16x8_t vprod2x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va2, 5));
+                  vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c5));
+                  vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c5));
+                  const int16x8_t vprod2x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va2, 5));
+                  vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc5));
+                  vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc5));
+
+                  if (k > 6 * sizeof(int8_t)) {
+                    const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+                    const int8x8_t vb89ABCDEFc6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+                    const int16x8_t vprod0x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va0, 6));
+                    vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c6));
+                    vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c6));
+                    const int16x8_t vprod0x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va0, 6));
+                    vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc6));
+                    vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc6));
+                    const int16x8_t vprod1x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va1, 6));
+                    vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c6));
+                    vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c6));
+                    const int16x8_t vprod1x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va1, 6));
+                    vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc6));
+                    vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc6));
+                    const int16x8_t vprod2x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va2, 6));
+                    vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c6));
+                    vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c6));
+                    const int16x8_t vprod2x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va2, 6));
+                    vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc6));
+                    vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc6));
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+      p -= 3 * sizeof(void*);
+    } while (p != 0);
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+    vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc1x89AB = vqrdmulhq_s32(vacc1x89AB, vmultiplier);
+    vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier);
+    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+    vacc2x89AB = vqrdmulhq_s32(vacc2x89AB, vmultiplier);
+    vacc2xCDEF = vqrdmulhq_s32(vacc2xCDEF, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+    vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc1x89AB = vsraq_n_s32(vacc1x89AB, vbicq_s32(vacc1x89AB, vzero_shift_mask), 31);
+    vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31);
+    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+    vacc2x89AB = vsraq_n_s32(vacc2x89AB, vbicq_s32(vacc2x89AB, vzero_shift_mask), 31);
+    vacc2xCDEF = vsraq_n_s32(vacc2xCDEF, vbicq_s32(vacc2xCDEF, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc1x89AB = vrshlq_s32(vacc1x89AB, vright_shift);
+    vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_shift);
+    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+    vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_shift);
+    vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+    const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+    int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
+    int8x16_t vout2x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc2x89ABCDEF);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+    const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+    int8x16_t vout1x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc1x01234567), vqmovn_s16(vacc1x89ABCDEF));
+    int8x16_t vout2x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc2x89ABCDEF));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout2x0123456789ABCDEF = vmaxq_s8(vout2x0123456789ABCDEF, voutput_min);
+    vout1x0123456789ABCDEF = vmaxq_s8(vout1x0123456789ABCDEF, voutput_min);
+    vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+
+    vout2x0123456789ABCDEF = vminq_s8(vout2x0123456789ABCDEF, voutput_max);
+    vout1x0123456789ABCDEF = vminq_s8(vout1x0123456789ABCDEF, voutput_max);
+    vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+
+    if (nc >= 16) {
+      vst1q_s8(c2 + 0, vout2x0123456789ABCDEF);
+      vst1q_s8(c1 + 0, vout1x0123456789ABCDEF);
+      vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 16;
+    } else {
+      int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8(vout1x0123456789ABCDEF));
+      int8x8_t vout2x01234567 = vget_low_s8(vout2x0123456789ABCDEF);
+      if (nc & 8) {
+        vst1_s8(c2, vout2x01234567); c2 += 8;
+        vst1_s8(c1, vget_high_s8(vout0x01234567_1x01234567)); c1 += 8;
+        vst1_s8(c0, vget_low_s8(vout0x01234567_1x01234567)); c0 += 8;
+        vout2x01234567 = vget_high_s8(vout2x0123456789ABCDEF);
+        vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0123456789ABCDEF));
+      }
+      if (nc & 4) {
+        vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1_lane_s8(c2, vout2x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/3x16c16-minmax-neon-mlal-padal.c b/src/qs8-igemm/gen/3x16c16-minmax-neon-mlal-padal.c
new file mode 100644
index 0000000..2b9bfae
--- /dev/null
+++ b/src/qs8-igemm/gen/3x16c16-minmax-neon-mlal-padal.c
@@ -0,0 +1,540 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/c16-neon-mlal-padal.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (3 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 16);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+
+  do {
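+    // Load the 16 per-channel bias values into lane 0 of 16 separate accumulators and replicate
+    // them for rows 1 and 2.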
+    int32x4_t vacc0x0 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x1 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x2 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x3 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x4 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x5 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x6 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x7 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x8 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x9 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x10 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x11 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x12 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x13 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x14 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x15 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc1x0 = vacc0x0;
+    int32x4_t vacc1x1 = vacc0x1;
+    int32x4_t vacc1x2 = vacc0x2;
+    int32x4_t vacc1x3 = vacc0x3;
+    int32x4_t vacc1x4 = vacc0x4;
+    int32x4_t vacc1x5 = vacc0x5;
+    int32x4_t vacc1x6 = vacc0x6;
+    int32x4_t vacc1x7 = vacc0x7;
+    int32x4_t vacc1x8 = vacc0x8;
+    int32x4_t vacc1x9 = vacc0x9;
+    int32x4_t vacc1x10 = vacc0x10;
+    int32x4_t vacc1x11 = vacc0x11;
+    int32x4_t vacc1x12 = vacc0x12;
+    int32x4_t vacc1x13 = vacc0x13;
+    int32x4_t vacc1x14 = vacc0x14;
+    int32x4_t vacc1x15 = vacc0x15;
+    int32x4_t vacc2x0 = vacc0x0;
+    int32x4_t vacc2x1 = vacc0x1;
+    int32x4_t vacc2x2 = vacc0x2;
+    int32x4_t vacc2x3 = vacc0x3;
+    int32x4_t vacc2x4 = vacc0x4;
+    int32x4_t vacc2x5 = vacc0x5;
+    int32x4_t vacc2x6 = vacc0x6;
+    int32x4_t vacc2x7 = vacc0x7;
+    int32x4_t vacc2x8 = vacc0x8;
+    int32x4_t vacc2x9 = vacc0x9;
+    int32x4_t vacc2x10 = vacc0x10;
+    int32x4_t vacc2x11 = vacc0x11;
+    int32x4_t vacc2x12 = vacc0x12;
+    int32x4_t vacc2x13 = vacc0x13;
+    int32x4_t vacc2x14 = vacc0x14;
+    int32x4_t vacc2x15 = vacc0x15;
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      const int8_t* restrict a2 = a[2];
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+      }
+      a += 3;
+
+      // KC loop: kc was rounded up to a multiple of 16 above, so each iteration
+      // consumes a full 16-byte block; up to 15 bytes of remainder are covered
+      // by the zero-padded packed weights.
+      size_t k = 0;
+      while (k < kc) {
+        const int8x16_t va0 = vld1q_s8(a0); a0 += 16;
+        const int8x16_t va1 = vld1q_s8(a1); a1 += 16;
+        const int8x16_t va2 = vld1q_s8(a2); a2 += 16;
+
+        const int8x16_t vb0 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb1 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb2 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb3 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb4 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb5 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb6 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb7 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb8 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb9 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb10 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb11 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb12 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb13 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb14 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb15 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+
+        int16x8_t vprod0x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va0));
+        int16x8_t vprod1x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va1));
+        int16x8_t vprod2x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va2));
+        vprod0x0 = vmlal_s8(vprod0x0, vget_high_s8(vb0), vget_high_s8(va0));
+        vprod1x0 = vmlal_s8(vprod1x0, vget_high_s8(vb0), vget_high_s8(va1));
+        vprod2x0 = vmlal_s8(vprod2x0, vget_high_s8(vb0), vget_high_s8(va2));
+        vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+        vacc1x0 = vpadalq_s16(vacc1x0, vprod1x0);
+        vacc2x0 = vpadalq_s16(vacc2x0, vprod2x0);
+        int16x8_t vprod0x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va0));
+        int16x8_t vprod1x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va1));
+        int16x8_t vprod2x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va2));
+        vprod0x1 = vmlal_s8(vprod0x1, vget_high_s8(vb1), vget_high_s8(va0));
+        vprod1x1 = vmlal_s8(vprod1x1, vget_high_s8(vb1), vget_high_s8(va1));
+        vprod2x1 = vmlal_s8(vprod2x1, vget_high_s8(vb1), vget_high_s8(va2));
+        vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+        vacc1x1 = vpadalq_s16(vacc1x1, vprod1x1);
+        vacc2x1 = vpadalq_s16(vacc2x1, vprod2x1);
+        int16x8_t vprod0x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va0));
+        int16x8_t vprod1x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va1));
+        int16x8_t vprod2x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va2));
+        vprod0x2 = vmlal_s8(vprod0x2, vget_high_s8(vb2), vget_high_s8(va0));
+        vprod1x2 = vmlal_s8(vprod1x2, vget_high_s8(vb2), vget_high_s8(va1));
+        vprod2x2 = vmlal_s8(vprod2x2, vget_high_s8(vb2), vget_high_s8(va2));
+        vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+        vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);
+        vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
+        int16x8_t vprod0x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va0));
+        int16x8_t vprod1x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va1));
+        int16x8_t vprod2x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va2));
+        vprod0x3 = vmlal_s8(vprod0x3, vget_high_s8(vb3), vget_high_s8(va0));
+        vprod1x3 = vmlal_s8(vprod1x3, vget_high_s8(vb3), vget_high_s8(va1));
+        vprod2x3 = vmlal_s8(vprod2x3, vget_high_s8(vb3), vget_high_s8(va2));
+        vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+        vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);
+        vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
+        int16x8_t vprod0x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va0));
+        int16x8_t vprod1x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va1));
+        int16x8_t vprod2x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va2));
+        vprod0x4 = vmlal_s8(vprod0x4, vget_high_s8(vb4), vget_high_s8(va0));
+        vprod1x4 = vmlal_s8(vprod1x4, vget_high_s8(vb4), vget_high_s8(va1));
+        vprod2x4 = vmlal_s8(vprod2x4, vget_high_s8(vb4), vget_high_s8(va2));
+        vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+        vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);
+        vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);
+        int16x8_t vprod0x5 = vmull_s8(vget_low_s8(vb5), vget_low_s8(va0));
+        int16x8_t vprod1x5 = vmull_s8(vget_low_s8(vb5), vget_low_s8(va1));
+        int16x8_t vprod2x5 = vmull_s8(vget_low_s8(vb5), vget_low_s8(va2));
+        vprod0x5 = vmlal_s8(vprod0x5, vget_high_s8(vb5), vget_high_s8(va0));
+        vprod1x5 = vmlal_s8(vprod1x5, vget_high_s8(vb5), vget_high_s8(va1));
+        vprod2x5 = vmlal_s8(vprod2x5, vget_high_s8(vb5), vget_high_s8(va2));
+        vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+        vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);
+        vacc2x5 = vpadalq_s16(vacc2x5, vprod2x5);
+        int16x8_t vprod0x6 = vmull_s8(vget_low_s8(vb6), vget_low_s8(va0));
+        int16x8_t vprod1x6 = vmull_s8(vget_low_s8(vb6), vget_low_s8(va1));
+        int16x8_t vprod2x6 = vmull_s8(vget_low_s8(vb6), vget_low_s8(va2));
+        vprod0x6 = vmlal_s8(vprod0x6, vget_high_s8(vb6), vget_high_s8(va0));
+        vprod1x6 = vmlal_s8(vprod1x6, vget_high_s8(vb6), vget_high_s8(va1));
+        vprod2x6 = vmlal_s8(vprod2x6, vget_high_s8(vb6), vget_high_s8(va2));
+        vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+        vacc1x6 = vpadalq_s16(vacc1x6, vprod1x6);
+        vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6);
+        int16x8_t vprod0x7 = vmull_s8(vget_low_s8(vb7), vget_low_s8(va0));
+        int16x8_t vprod1x7 = vmull_s8(vget_low_s8(vb7), vget_low_s8(va1));
+        int16x8_t vprod2x7 = vmull_s8(vget_low_s8(vb7), vget_low_s8(va2));
+        vprod0x7 = vmlal_s8(vprod0x7, vget_high_s8(vb7), vget_high_s8(va0));
+        vprod1x7 = vmlal_s8(vprod1x7, vget_high_s8(vb7), vget_high_s8(va1));
+        vprod2x7 = vmlal_s8(vprod2x7, vget_high_s8(vb7), vget_high_s8(va2));
+        vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+        vacc1x7 = vpadalq_s16(vacc1x7, vprod1x7);
+        vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7);
+        int16x8_t vprod0x8 = vmull_s8(vget_low_s8(vb8), vget_low_s8(va0));
+        int16x8_t vprod1x8 = vmull_s8(vget_low_s8(vb8), vget_low_s8(va1));
+        int16x8_t vprod2x8 = vmull_s8(vget_low_s8(vb8), vget_low_s8(va2));
+        vprod0x8 = vmlal_s8(vprod0x8, vget_high_s8(vb8), vget_high_s8(va0));
+        vprod1x8 = vmlal_s8(vprod1x8, vget_high_s8(vb8), vget_high_s8(va1));
+        vprod2x8 = vmlal_s8(vprod2x8, vget_high_s8(vb8), vget_high_s8(va2));
+        vacc0x8 = vpadalq_s16(vacc0x8, vprod0x8);
+        vacc1x8 = vpadalq_s16(vacc1x8, vprod1x8);
+        vacc2x8 = vpadalq_s16(vacc2x8, vprod2x8);
+        int16x8_t vprod0x9 = vmull_s8(vget_low_s8(vb9), vget_low_s8(va0));
+        int16x8_t vprod1x9 = vmull_s8(vget_low_s8(vb9), vget_low_s8(va1));
+        int16x8_t vprod2x9 = vmull_s8(vget_low_s8(vb9), vget_low_s8(va2));
+        vprod0x9 = vmlal_s8(vprod0x9, vget_high_s8(vb9), vget_high_s8(va0));
+        vprod1x9 = vmlal_s8(vprod1x9, vget_high_s8(vb9), vget_high_s8(va1));
+        vprod2x9 = vmlal_s8(vprod2x9, vget_high_s8(vb9), vget_high_s8(va2));
+        vacc0x9 = vpadalq_s16(vacc0x9, vprod0x9);
+        vacc1x9 = vpadalq_s16(vacc1x9, vprod1x9);
+        vacc2x9 = vpadalq_s16(vacc2x9, vprod2x9);
+        int16x8_t vprod0x10 = vmull_s8(vget_low_s8(vb10), vget_low_s8(va0));
+        int16x8_t vprod1x10 = vmull_s8(vget_low_s8(vb10), vget_low_s8(va1));
+        int16x8_t vprod2x10 = vmull_s8(vget_low_s8(vb10), vget_low_s8(va2));
+        vprod0x10 = vmlal_s8(vprod0x10, vget_high_s8(vb10), vget_high_s8(va0));
+        vprod1x10 = vmlal_s8(vprod1x10, vget_high_s8(vb10), vget_high_s8(va1));
+        vprod2x10 = vmlal_s8(vprod2x10, vget_high_s8(vb10), vget_high_s8(va2));
+        vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
+        vacc1x10 = vpadalq_s16(vacc1x10, vprod1x10);
+        vacc2x10 = vpadalq_s16(vacc2x10, vprod2x10);
+        int16x8_t vprod0x11 = vmull_s8(vget_low_s8(vb11), vget_low_s8(va0));
+        int16x8_t vprod1x11 = vmull_s8(vget_low_s8(vb11), vget_low_s8(va1));
+        int16x8_t vprod2x11 = vmull_s8(vget_low_s8(vb11), vget_low_s8(va2));
+        vprod0x11 = vmlal_s8(vprod0x11, vget_high_s8(vb11), vget_high_s8(va0));
+        vprod1x11 = vmlal_s8(vprod1x11, vget_high_s8(vb11), vget_high_s8(va1));
+        vprod2x11 = vmlal_s8(vprod2x11, vget_high_s8(vb11), vget_high_s8(va2));
+        vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11);
+        vacc1x11 = vpadalq_s16(vacc1x11, vprod1x11);
+        vacc2x11 = vpadalq_s16(vacc2x11, vprod2x11);
+        int16x8_t vprod0x12 = vmull_s8(vget_low_s8(vb12), vget_low_s8(va0));
+        int16x8_t vprod1x12 = vmull_s8(vget_low_s8(vb12), vget_low_s8(va1));
+        int16x8_t vprod2x12 = vmull_s8(vget_low_s8(vb12), vget_low_s8(va2));
+        vprod0x12 = vmlal_s8(vprod0x12, vget_high_s8(vb12), vget_high_s8(va0));
+        vprod1x12 = vmlal_s8(vprod1x12, vget_high_s8(vb12), vget_high_s8(va1));
+        vprod2x12 = vmlal_s8(vprod2x12, vget_high_s8(vb12), vget_high_s8(va2));
+        vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);
+        vacc1x12 = vpadalq_s16(vacc1x12, vprod1x12);
+        vacc2x12 = vpadalq_s16(vacc2x12, vprod2x12);
+        int16x8_t vprod0x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va0));
+        int16x8_t vprod1x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va1));
+        int16x8_t vprod2x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va2));
+        vprod0x13 = vmlal_s8(vprod0x13, vget_high_s8(vb13), vget_high_s8(va0));
+        vprod1x13 = vmlal_s8(vprod1x13, vget_high_s8(vb13), vget_high_s8(va1));
+        vprod2x13 = vmlal_s8(vprod2x13, vget_high_s8(vb13), vget_high_s8(va2));
+        vacc0x13 = vpadalq_s16(vacc0x13, vprod0x13);
+        vacc1x13 = vpadalq_s16(vacc1x13, vprod1x13);
+        vacc2x13 = vpadalq_s16(vacc2x13, vprod2x13);
+        int16x8_t vprod0x14 = vmull_s8(vget_low_s8(vb14), vget_low_s8(va0));
+        int16x8_t vprod1x14 = vmull_s8(vget_low_s8(vb14), vget_low_s8(va1));
+        int16x8_t vprod2x14 = vmull_s8(vget_low_s8(vb14), vget_low_s8(va2));
+        vprod0x14 = vmlal_s8(vprod0x14, vget_high_s8(vb14), vget_high_s8(va0));
+        vprod1x14 = vmlal_s8(vprod1x14, vget_high_s8(vb14), vget_high_s8(va1));
+        vprod2x14 = vmlal_s8(vprod2x14, vget_high_s8(vb14), vget_high_s8(va2));
+        vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
+        vacc1x14 = vpadalq_s16(vacc1x14, vprod1x14);
+        vacc2x14 = vpadalq_s16(vacc2x14, vprod2x14);
+        int16x8_t vprod0x15 = vmull_s8(vget_low_s8(vb15), vget_low_s8(va0));
+        int16x8_t vprod1x15 = vmull_s8(vget_low_s8(vb15), vget_low_s8(va1));
+        int16x8_t vprod2x15 = vmull_s8(vget_low_s8(vb15), vget_low_s8(va2));
+        vprod0x15 = vmlal_s8(vprod0x15, vget_high_s8(vb15), vget_high_s8(va0));
+        vprod1x15 = vmlal_s8(vprod1x15, vget_high_s8(vb15), vget_high_s8(va1));
+        vprod2x15 = vmlal_s8(vprod2x15, vget_high_s8(vb15), vget_high_s8(va2));
+        vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
+        vacc1x15 = vpadalq_s16(vacc1x15, vprod1x15);
+        vacc2x15 = vpadalq_s16(vacc2x15, vprod2x15);
+
+        k += 16 * sizeof(int8_t);
+      }
+
+      p -= 3 * sizeof(void*);
+    } while (p != 0);
+
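+    // Each vaccMxN above holds four int32 partial sums for output column N of
+    // row M; the pairwise additions below collapse them to one total per column,
+    // packed as vaccMx0123, vaccMx4567, vaccMx89AB and vaccMxCDEF.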
+#if XNN_ARCH_ARM64
+    const int32x4_t vsum0x01 = vpaddq_s32(vacc0x0, vacc0x1);
+    const int32x4_t vsum0x23 = vpaddq_s32(vacc0x2, vacc0x3);
+    const int32x4_t vsum0x45 = vpaddq_s32(vacc0x4, vacc0x5);
+    const int32x4_t vsum0x67 = vpaddq_s32(vacc0x6, vacc0x7);
+    const int32x4_t vsum0x89 = vpaddq_s32(vacc0x8, vacc0x9);
+    const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11);
+    const int32x4_t vsum0xCD = vpaddq_s32(vacc0x12, vacc0x13);
+    const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
+    const int32x4_t vsum1x01 = vpaddq_s32(vacc1x0, vacc1x1);
+    const int32x4_t vsum1x23 = vpaddq_s32(vacc1x2, vacc1x3);
+    const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);
+    const int32x4_t vsum1x67 = vpaddq_s32(vacc1x6, vacc1x7);
+    const int32x4_t vsum1x89 = vpaddq_s32(vacc1x8, vacc1x9);
+    const int32x4_t vsum1xAB = vpaddq_s32(vacc1x10, vacc1x11);
+    const int32x4_t vsum1xCD = vpaddq_s32(vacc1x12, vacc1x13);
+    const int32x4_t vsum1xEF = vpaddq_s32(vacc1x14, vacc1x15);
+    const int32x4_t vsum2x01 = vpaddq_s32(vacc2x0, vacc2x1);
+    const int32x4_t vsum2x23 = vpaddq_s32(vacc2x2, vacc2x3);
+    const int32x4_t vsum2x45 = vpaddq_s32(vacc2x4, vacc2x5);
+    const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7);
+    const int32x4_t vsum2x89 = vpaddq_s32(vacc2x8, vacc2x9);
+    const int32x4_t vsum2xAB = vpaddq_s32(vacc2x10, vacc2x11);
+    const int32x4_t vsum2xCD = vpaddq_s32(vacc2x12, vacc2x13);
+    const int32x4_t vsum2xEF = vpaddq_s32(vacc2x14, vacc2x15);
+    int32x4_t vacc0x0123 = vpaddq_s32(vsum0x01, vsum0x23);
+    int32x4_t vacc0x4567 = vpaddq_s32(vsum0x45, vsum0x67);
+    int32x4_t vacc0x89AB = vpaddq_s32(vsum0x89, vsum0xAB);
+    int32x4_t vacc0xCDEF = vpaddq_s32(vsum0xCD, vsum0xEF);
+    int32x4_t vacc1x0123 = vpaddq_s32(vsum1x01, vsum1x23);
+    int32x4_t vacc1x4567 = vpaddq_s32(vsum1x45, vsum1x67);
+    int32x4_t vacc1x89AB = vpaddq_s32(vsum1x89, vsum1xAB);
+    int32x4_t vacc1xCDEF = vpaddq_s32(vsum1xCD, vsum1xEF);
+    int32x4_t vacc2x0123 = vpaddq_s32(vsum2x01, vsum2x23);
+    int32x4_t vacc2x4567 = vpaddq_s32(vsum2x45, vsum2x67);
+    int32x4_t vacc2x89AB = vpaddq_s32(vsum2x89, vsum2xAB);
+    int32x4_t vacc2xCDEF = vpaddq_s32(vsum2xCD, vsum2xEF);
+#else
+    const int32x2_t vpsum0x0 = vadd_s32(vget_low_s32(vacc0x0), vget_high_s32(vacc0x0));
+    const int32x2_t vpsum0x1 = vadd_s32(vget_low_s32(vacc0x1), vget_high_s32(vacc0x1));
+    const int32x2_t vpsum0x2 = vadd_s32(vget_low_s32(vacc0x2), vget_high_s32(vacc0x2));
+    const int32x2_t vpsum0x3 = vadd_s32(vget_low_s32(vacc0x3), vget_high_s32(vacc0x3));
+    const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1);
+    const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3);
+    int32x4_t vacc0x0123 = vcombine_s32(vsum0x01, vsum0x23 );
+    const int32x2_t vpsum0x4 = vadd_s32(vget_low_s32(vacc0x4), vget_high_s32(vacc0x4));
+    const int32x2_t vpsum0x5 = vadd_s32(vget_low_s32(vacc0x5), vget_high_s32(vacc0x5));
+    const int32x2_t vpsum0x6 = vadd_s32(vget_low_s32(vacc0x6), vget_high_s32(vacc0x6));
+    const int32x2_t vpsum0x7 = vadd_s32(vget_low_s32(vacc0x7), vget_high_s32(vacc0x7));
+    const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5);
+    const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7);
+    int32x4_t vacc0x4567 = vcombine_s32(vsum0x45, vsum0x67 );
+    const int32x2_t vpsum0x8 = vadd_s32(vget_low_s32(vacc0x8), vget_high_s32(vacc0x8));
+    const int32x2_t vpsum0x9 = vadd_s32(vget_low_s32(vacc0x9), vget_high_s32(vacc0x9));
+    const int32x2_t vpsum0xA = vadd_s32(vget_low_s32(vacc0x10), vget_high_s32(vacc0x10));
+    const int32x2_t vpsum0xB = vadd_s32(vget_low_s32(vacc0x11), vget_high_s32(vacc0x11));
+    const int32x2_t vsum0x89 = vpadd_s32(vpsum0x8, vpsum0x9);
+    const int32x2_t vsum0xAB = vpadd_s32(vpsum0xA, vpsum0xB);
+    int32x4_t vacc0x89AB = vcombine_s32(vsum0x89, vsum0xAB );
+    const int32x2_t vpsum0xC = vadd_s32(vget_low_s32(vacc0x12), vget_high_s32(vacc0x12));
+    const int32x2_t vpsum0xD = vadd_s32(vget_low_s32(vacc0x13), vget_high_s32(vacc0x13));
+    const int32x2_t vpsum0xE = vadd_s32(vget_low_s32(vacc0x14), vget_high_s32(vacc0x14));
+    const int32x2_t vpsum0xF = vadd_s32(vget_low_s32(vacc0x15), vget_high_s32(vacc0x15));
+    const int32x2_t vsum0xCD = vpadd_s32(vpsum0xC, vpsum0xD);
+    const int32x2_t vsum0xEF = vpadd_s32(vpsum0xE, vpsum0xF);
+    int32x4_t vacc0xCDEF = vcombine_s32(vsum0xCD, vsum0xEF );
+    const int32x2_t vpsum1x0 = vadd_s32(vget_low_s32(vacc1x0), vget_high_s32(vacc1x0));
+    const int32x2_t vpsum1x1 = vadd_s32(vget_low_s32(vacc1x1), vget_high_s32(vacc1x1));
+    const int32x2_t vpsum1x2 = vadd_s32(vget_low_s32(vacc1x2), vget_high_s32(vacc1x2));
+    const int32x2_t vpsum1x3 = vadd_s32(vget_low_s32(vacc1x3), vget_high_s32(vacc1x3));
+    const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1);
+    const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3);
+    int32x4_t vacc1x0123 = vcombine_s32(vsum1x01, vsum1x23 );
+    const int32x2_t vpsum1x4 = vadd_s32(vget_low_s32(vacc1x4), vget_high_s32(vacc1x4));
+    const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));
+    const int32x2_t vpsum1x6 = vadd_s32(vget_low_s32(vacc1x6), vget_high_s32(vacc1x6));
+    const int32x2_t vpsum1x7 = vadd_s32(vget_low_s32(vacc1x7), vget_high_s32(vacc1x7));
+    const int32x2_t vsum1x45 = vpadd_s32(vpsum1x4, vpsum1x5);
+    const int32x2_t vsum1x67 = vpadd_s32(vpsum1x6, vpsum1x7);
+    int32x4_t vacc1x4567 = vcombine_s32(vsum1x45, vsum1x67 );
+    const int32x2_t vpsum1x8 = vadd_s32(vget_low_s32(vacc1x8), vget_high_s32(vacc1x8));
+    const int32x2_t vpsum1x9 = vadd_s32(vget_low_s32(vacc1x9), vget_high_s32(vacc1x9));
+    const int32x2_t vpsum1xA = vadd_s32(vget_low_s32(vacc1x10), vget_high_s32(vacc1x10));
+    const int32x2_t vpsum1xB = vadd_s32(vget_low_s32(vacc1x11), vget_high_s32(vacc1x11));
+    const int32x2_t vsum1x89 = vpadd_s32(vpsum1x8, vpsum1x9);
+    const int32x2_t vsum1xAB = vpadd_s32(vpsum1xA, vpsum1xB);
+    int32x4_t vacc1x89AB = vcombine_s32(vsum1x89, vsum1xAB );
+    const int32x2_t vpsum1xC = vadd_s32(vget_low_s32(vacc1x12), vget_high_s32(vacc1x12));
+    const int32x2_t vpsum1xD = vadd_s32(vget_low_s32(vacc1x13), vget_high_s32(vacc1x13));
+    const int32x2_t vpsum1xE = vadd_s32(vget_low_s32(vacc1x14), vget_high_s32(vacc1x14));
+    const int32x2_t vpsum1xF = vadd_s32(vget_low_s32(vacc1x15), vget_high_s32(vacc1x15));
+    const int32x2_t vsum1xCD = vpadd_s32(vpsum1xC, vpsum1xD);
+    const int32x2_t vsum1xEF = vpadd_s32(vpsum1xE, vpsum1xF);
+    int32x4_t vacc1xCDEF = vcombine_s32(vsum1xCD, vsum1xEF );
+    const int32x2_t vpsum2x0 = vadd_s32(vget_low_s32(vacc2x0), vget_high_s32(vacc2x0));
+    const int32x2_t vpsum2x1 = vadd_s32(vget_low_s32(vacc2x1), vget_high_s32(vacc2x1));
+    const int32x2_t vpsum2x2 = vadd_s32(vget_low_s32(vacc2x2), vget_high_s32(vacc2x2));
+    const int32x2_t vpsum2x3 = vadd_s32(vget_low_s32(vacc2x3), vget_high_s32(vacc2x3));
+    const int32x2_t vsum2x01 = vpadd_s32(vpsum2x0, vpsum2x1);
+    const int32x2_t vsum2x23 = vpadd_s32(vpsum2x2, vpsum2x3);
+    int32x4_t vacc2x0123 = vcombine_s32(vsum2x01, vsum2x23 );
+    const int32x2_t vpsum2x4 = vadd_s32(vget_low_s32(vacc2x4), vget_high_s32(vacc2x4));
+    const int32x2_t vpsum2x5 = vadd_s32(vget_low_s32(vacc2x5), vget_high_s32(vacc2x5));
+    const int32x2_t vpsum2x6 = vadd_s32(vget_low_s32(vacc2x6), vget_high_s32(vacc2x6));
+    const int32x2_t vpsum2x7 = vadd_s32(vget_low_s32(vacc2x7), vget_high_s32(vacc2x7));
+    const int32x2_t vsum2x45 = vpadd_s32(vpsum2x4, vpsum2x5);
+    const int32x2_t vsum2x67 = vpadd_s32(vpsum2x6, vpsum2x7);
+    int32x4_t vacc2x4567 = vcombine_s32(vsum2x45, vsum2x67 );
+    const int32x2_t vpsum2x8 = vadd_s32(vget_low_s32(vacc2x8), vget_high_s32(vacc2x8));
+    const int32x2_t vpsum2x9 = vadd_s32(vget_low_s32(vacc2x9), vget_high_s32(vacc2x9));
+    const int32x2_t vpsum2xA = vadd_s32(vget_low_s32(vacc2x10), vget_high_s32(vacc2x10));
+    const int32x2_t vpsum2xB = vadd_s32(vget_low_s32(vacc2x11), vget_high_s32(vacc2x11));
+    const int32x2_t vsum2x89 = vpadd_s32(vpsum2x8, vpsum2x9);
+    const int32x2_t vsum2xAB = vpadd_s32(vpsum2xA, vpsum2xB);
+    int32x4_t vacc2x89AB = vcombine_s32(vsum2x89, vsum2xAB );
+    const int32x2_t vpsum2xC = vadd_s32(vget_low_s32(vacc2x12), vget_high_s32(vacc2x12));
+    const int32x2_t vpsum2xD = vadd_s32(vget_low_s32(vacc2x13), vget_high_s32(vacc2x13));
+    const int32x2_t vpsum2xE = vadd_s32(vget_low_s32(vacc2x14), vget_high_s32(vacc2x14));
+    const int32x2_t vpsum2xF = vadd_s32(vget_low_s32(vacc2x15), vget_high_s32(vacc2x15));
+    const int32x2_t vsum2xCD = vpadd_s32(vpsum2xC, vpsum2xD);
+    const int32x2_t vsum2xEF = vpadd_s32(vpsum2xE, vpsum2xF);
+    int32x4_t vacc2xCDEF = vcombine_s32(vsum2xCD, vsum2xEF );
+#endif
+
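+    // Requantization: scale by the Q31 fixed-point multiplier (vqrdmulh), then
+    // apply a rounding arithmetic right shift; the vbic/vsra step pre-biases
+    // negative accumulators so that ties round away from zero.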
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+    vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc1x89AB = vqrdmulhq_s32(vacc1x89AB, vmultiplier);
+    vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier);
+    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+    vacc2x89AB = vqrdmulhq_s32(vacc2x89AB, vmultiplier);
+    vacc2xCDEF = vqrdmulhq_s32(vacc2xCDEF, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+    vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc1x89AB = vsraq_n_s32(vacc1x89AB, vbicq_s32(vacc1x89AB, vzero_shift_mask), 31);
+    vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31);
+    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+    vacc2x89AB = vsraq_n_s32(vacc2x89AB, vbicq_s32(vacc2x89AB, vzero_shift_mask), 31);
+    vacc2xCDEF = vsraq_n_s32(vacc2xCDEF, vbicq_s32(vacc2xCDEF, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc1x89AB = vrshlq_s32(vacc1x89AB, vright_shift);
+    vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_shift);
+    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+    vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_shift);
+    vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+    const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), voutput_zero_point);
+    int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+    int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
+    int8x16_t vout2x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc2x89ABCDEF);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+    const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+    int8x16_t vout1x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc1x01234567), vqmovn_s16(vacc1x89ABCDEF));
+    int8x16_t vout2x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc2x89ABCDEF));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout2x0123456789ABCDEF = vmaxq_s8(vout2x0123456789ABCDEF, voutput_min);
+    vout1x0123456789ABCDEF = vmaxq_s8(vout1x0123456789ABCDEF, voutput_min);
+    vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+
+    vout2x0123456789ABCDEF = vminq_s8(vout2x0123456789ABCDEF, voutput_max);
+    vout1x0123456789ABCDEF = vminq_s8(vout1x0123456789ABCDEF, voutput_max);
+    vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+
+    if (nc >= 16) {
+      vst1q_s8(c2 + 0, vout2x0123456789ABCDEF);
+      vst1q_s8(c1 + 0, vout1x0123456789ABCDEF);
+      vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 16;
+    } else {
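+      // Partial store of the last 1..15 output columns: rows 0 and 1 share one
+      // combined 16-byte register, and the stores narrow through 8/4/2/1-byte
+      // chunks according to the bits of nc.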
+      int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8(vout1x0123456789ABCDEF));
+      int8x8_t vout2x01234567 = vget_low_s8(vout2x0123456789ABCDEF);
+      if (nc & 8) {
+        vst1_s8(c2, vout2x01234567); c2 += 8;
+        vst1_s8(c1, vget_high_s8(vout0x01234567_1x01234567)); c1 += 8;
+        vst1_s8(c0, vget_low_s8(vout0x01234567_1x01234567)); c0 += 8;
+        vout2x01234567 = vget_high_s8(vout2x0123456789ABCDEF);
+        vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0123456789ABCDEF));
+      }
+      if (nc & 4) {
+        vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1_lane_s8(c2, vout2x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/3x16c2-minmax-neon-mlal-padal-dup.c b/src/qs8-igemm/gen/3x16c2-minmax-neon-mlal-padal-dup.c
new file mode 100644
index 0000000..5fa8f78
--- /dev/null
+++ b/src/qs8-igemm/gen/3x16c2-minmax-neon-mlal-padal-dup.c
@@ -0,0 +1,621 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/c2-neon-mull-padal-dup.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (3 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+
+  do {
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc1x0123 = vacc0x0123;
+    int32x4_t vacc1x4567 = vacc0x4567;
+    int32x4_t vacc1x89AB = vacc0x89AB;
+    int32x4_t vacc1xCDEF = vacc0xCDEF;
+    int32x4_t vacc2x0123 = vacc0x0123;
+    int32x4_t vacc2x4567 = vacc0x4567;
+    int32x4_t vacc2x89AB = vacc0x89AB;
+    int32x4_t vacc2xCDEF = vacc0xCDEF;
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      const int8_t* restrict a2 = a[2];
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+      }
+      a += 3;
+
+      size_t k = kc;
+
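+      // Main K loop: 16 bytes per iteration, split into two 8-byte halves (x0/x1).
+      // Weights are packed as 2 K values x 4 output columns (c2), so each
+      // vdup_lane_s16 broadcast pairs one 2-byte slice of A with an 8-byte slice of B.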
+      while (k >= 16 * sizeof(int8_t)) {
+        const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va0x1 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va1x0 = vld1_s8(a1); a1 += 8;
+        const int8x8_t va1x1 = vld1_s8(a1); a1 += 8;
+        const int8x8_t va2x0 = vld1_s8(a2); a2 += 8;
+        const int8x8_t va2x1 = vld1_s8(a2); a2 += 8;
+
+        const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+        int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0)));
+        int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
+        const int8x8_t vb0123c0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x0123c0 = vmlal_s8(vprod0x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+        vprod1x0123c0 = vmlal_s8(vprod1x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
+        vprod2x0123c0 = vmlal_s8(vprod2x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 0)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+        int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+        int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0)));
+        int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
+        const int8x8_t vb4567c0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x4567c0 = vmlal_s8(vprod0x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+        vprod1x4567c0 = vmlal_s8(vprod1x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
+        vprod2x4567c0 = vmlal_s8(vprod2x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 0)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+        int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+        int16x8_t vprod1x89ABc0 = vmull_s8(vb89ABc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0)));
+        int16x8_t vprod2x89ABc0 = vmull_s8(vb89ABc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
+        const int8x8_t vb89ABc0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x89ABc0 = vmlal_s8(vprod0x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+        vprod1x89ABc0 = vmlal_s8(vprod1x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
+        vprod2x89ABc0 = vmlal_s8(vprod2x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 0)));
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+        vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc0);
+        vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
+        int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+        int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0)));
+        int16x8_t vprod2xCDEFc0 = vmull_s8(vbCDEFc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
+        const int8x8_t vbCDEFc0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0xCDEFc0 = vmlal_s8(vprod0xCDEFc0, vbCDEFc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+        vprod1xCDEFc0 = vmlal_s8(vprod1xCDEFc0, vbCDEFc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
+        vprod2xCDEFc0 = vmlal_s8(vprod2xCDEFc0, vbCDEFc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 0)));
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+        vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0);
+        vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0);
+        int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+        int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 1)));
+        int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
+        const int8x8_t vb0123c1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x0123c1 = vmlal_s8(vprod0x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+        vprod1x0123c1 = vmlal_s8(vprod1x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
+        vprod2x0123c1 = vmlal_s8(vprod2x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 1)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+        int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+        int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 1)));
+        int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
+        const int8x8_t vb4567c1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x4567c1 = vmlal_s8(vprod0x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+        vprod1x4567c1 = vmlal_s8(vprod1x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
+        vprod2x4567c1 = vmlal_s8(vprod2x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 1)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+        int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+        int16x8_t vprod1x89ABc1 = vmull_s8(vb89ABc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 1)));
+        int16x8_t vprod2x89ABc1 = vmull_s8(vb89ABc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
+        const int8x8_t vb89ABc1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x89ABc1 = vmlal_s8(vprod0x89ABc1, vb89ABc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+        vprod1x89ABc1 = vmlal_s8(vprod1x89ABc1, vb89ABc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
+        vprod2x89ABc1 = vmlal_s8(vprod2x89ABc1, vb89ABc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 1)));
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+        vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc1);
+        vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1);
+        int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+        int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 1)));
+        int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
+        const int8x8_t vbCDEFc1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0xCDEFc1 = vmlal_s8(vprod0xCDEFc1, vbCDEFc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+        vprod1xCDEFc1 = vmlal_s8(vprod1xCDEFc1, vbCDEFc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
+        vprod2xCDEFc1 = vmlal_s8(vprod2xCDEFc1, vbCDEFc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 1)));
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+        vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1);
+        vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1);
+        int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+        int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 2)));
+        int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 2)));
+        const int8x8_t vb0123c2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x0123c2 = vmlal_s8(vprod0x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+        vprod1x0123c2 = vmlal_s8(vprod1x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
+        vprod2x0123c2 = vmlal_s8(vprod2x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 2)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+        int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+        int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 2)));
+        int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 2)));
+        const int8x8_t vb4567c2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x4567c2 = vmlal_s8(vprod0x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+        vprod1x4567c2 = vmlal_s8(vprod1x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
+        vprod2x4567c2 = vmlal_s8(vprod2x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 2)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+        int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+        int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 2)));
+        int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 2)));
+        const int8x8_t vb89ABc2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x89ABc2 = vmlal_s8(vprod0x89ABc2, vb89ABc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+        vprod1x89ABc2 = vmlal_s8(vprod1x89ABc2, vb89ABc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
+        vprod2x89ABc2 = vmlal_s8(vprod2x89ABc2, vb89ABc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 2)));
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+        vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
+        vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
+        int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+        int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 2)));
+        int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 2)));
+        const int8x8_t vbCDEFc2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0xCDEFc2 = vmlal_s8(vprod0xCDEFc2, vbCDEFc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+        vprod1xCDEFc2 = vmlal_s8(vprod1xCDEFc2, vbCDEFc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
+        vprod2xCDEFc2 = vmlal_s8(vprod2xCDEFc2, vbCDEFc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 2)));
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+        vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
+        vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2);
+        int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+        int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 3)));
+        int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 3)));
+        const int8x8_t vb0123c3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x0123c3 = vmlal_s8(vprod0x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+        vprod1x0123c3 = vmlal_s8(vprod1x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
+        vprod2x0123c3 = vmlal_s8(vprod2x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 3)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
+        int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+        int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 3)));
+        int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 3)));
+        const int8x8_t vb4567c3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x4567c3 = vmlal_s8(vprod0x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+        vprod1x4567c3 = vmlal_s8(vprod1x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
+        vprod2x4567c3 = vmlal_s8(vprod2x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 3)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
+        int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+        int16x8_t vprod1x89ABc3 = vmull_s8(vb89ABc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 3)));
+        int16x8_t vprod2x89ABc3 = vmull_s8(vb89ABc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 3)));
+        const int8x8_t vb89ABc3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x89ABc3 = vmlal_s8(vprod0x89ABc3, vb89ABc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+        vprod1x89ABc3 = vmlal_s8(vprod1x89ABc3, vb89ABc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
+        vprod2x89ABc3 = vmlal_s8(vprod2x89ABc3, vb89ABc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 3)));
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
+        vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc3);
+        vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3);
+        int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+        int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 3)));
+        int16x8_t vprod2xCDEFc3 = vmull_s8(vbCDEFc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 3)));
+        const int8x8_t vbCDEFc3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0xCDEFc3 = vmlal_s8(vprod0xCDEFc3, vbCDEFc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+        vprod1xCDEFc3 = vmlal_s8(vprod1xCDEFc3, vbCDEFc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
+        vprod2xCDEFc3 = vmlal_s8(vprod2xCDEFc3, vbCDEFc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 3)));
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
+        vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3);
+        vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3);
+
+        k -= 16 * sizeof(int8_t);
+      }
+
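+      // Remainder: up to one more 8-byte block of K, using plain vmull without
+      // the paired vmlal accumulation of the main loop.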
+      if (k >= 8 * sizeof(int8_t)) {
+        const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+        const int8x8_t va2 = vld1_s8(a2); a2 += 8;
+
+        const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+        const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+        const int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
+        const int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
+        const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+        const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+        const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+        const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+        const int16x8_t vprod1x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        const int16x8_t vprod1x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+        const int16x8_t vprod1x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+        vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc0);
+        vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc1);
+        vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
+        vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc3);
+        const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+        const int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+        vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0);
+        vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1);
+        vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
+        vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3);
+        const int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+        const int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+        const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+        const int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
+        const int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+        const int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+        const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+        const int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
+        const int16x8_t vprod2x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+        const int16x8_t vprod2x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+        const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+        const int16x8_t vprod2x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+        vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
+        vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1);
+        vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
+        vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3);
+        const int16x8_t vprod2xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+        const int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+        const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+        const int16x8_t vprod2xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+        vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0);
+        vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1);
+        vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2);
+        vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3);
+
+        k -= 8 * sizeof(int8_t);
+      }
+
+      if XNN_UNLIKELY(k != 0) {
+        const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+        const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
+        const int8x8_t va2 = vld1_s8(a2); a2 = (const int8_t*) ((uintptr_t) a2 + k);
+
+        const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+        const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+        const int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+        const int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+        const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+        const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+        const int16x8_t vprod1x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc0);
+        const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0);
+        const int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+        const int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+        const int16x8_t vprod2x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+        vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
+        const int16x8_t vprod2xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+        vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0);
+
+        if (k > 2 * sizeof(int8_t)) {
+          const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb89ABc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vbCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+          const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+          const int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+          const int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+          const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+          vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+          const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+          vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+          const int16x8_t vprod1x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+          vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc1);
+          const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+          vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1);
+          const int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+          vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+          const int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+          vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+          const int16x8_t vprod2x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+          vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1);
+          const int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+          vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1);
+
+          if (k > 4 * sizeof(int8_t)) {
+            const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int8x8_t vb89ABc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int8x8_t vbCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+            const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+            const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+            const int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+            const int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+            const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+            vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+            const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+            vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+            const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+            vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
+            const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+            vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
+            const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+            vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+            const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+            vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+            const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+            vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
+            const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+            vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2);
+          }
+        }
+      }
+      p -= 3 * sizeof(void*);
+    } while (p != 0);
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+    vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc1x89AB = vqrdmulhq_s32(vacc1x89AB, vmultiplier);
+    vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier);
+    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+    vacc2x89AB = vqrdmulhq_s32(vacc2x89AB, vmultiplier);
+    vacc2xCDEF = vqrdmulhq_s32(vacc2xCDEF, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+    vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc1x89AB = vsraq_n_s32(vacc1x89AB, vbicq_s32(vacc1x89AB, vzero_shift_mask), 31);
+    vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31);
+    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+    vacc2x89AB = vsraq_n_s32(vacc2x89AB, vbicq_s32(vacc2x89AB, vzero_shift_mask), 31);
+    vacc2xCDEF = vsraq_n_s32(vacc2xCDEF, vbicq_s32(vacc2xCDEF, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc1x89AB = vrshlq_s32(vacc1x89AB, vright_shift);
+    vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_shift);
+    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+    vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_shift);
+    vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+    const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+    int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
+    int8x16_t vout2x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc2x89ABCDEF);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+    const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+    int8x16_t vout1x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc1x01234567), vqmovn_s16(vacc1x89ABCDEF));
+    int8x16_t vout2x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc2x89ABCDEF));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout2x0123456789ABCDEF = vmaxq_s8(vout2x0123456789ABCDEF, voutput_min);
+    vout1x0123456789ABCDEF = vmaxq_s8(vout1x0123456789ABCDEF, voutput_min);
+    vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+
+    vout2x0123456789ABCDEF = vminq_s8(vout2x0123456789ABCDEF, voutput_max);
+    vout1x0123456789ABCDEF = vminq_s8(vout1x0123456789ABCDEF, voutput_max);
+    vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+
+    if (nc >= 16) {
+      vst1q_s8(c2 + 0, vout2x0123456789ABCDEF);
+      vst1q_s8(c1 + 0, vout1x0123456789ABCDEF);
+      vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 16;
+    } else {
+      int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8(vout1x0123456789ABCDEF));
+      int8x8_t vout2x01234567 = vget_low_s8(vout2x0123456789ABCDEF);
+      if (nc & 8) {
+        vst1_s8(c2, vout2x01234567); c2 += 8;
+        vst1_s8(c1, vget_high_s8(vout0x01234567_1x01234567)); c1 += 8;
+        vst1_s8(c0, vget_low_s8(vout0x01234567_1x01234567)); c0 += 8;
+        vout2x01234567 = vget_high_s8(vout2x0123456789ABCDEF);
+        vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0123456789ABCDEF));
+      }
+      if (nc & 4) {
+        vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1_lane_s8(c2, vout2x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/3x16c2-minmax-neon-mull-padal-dup.c b/src/qs8-igemm/gen/3x16c2-minmax-neon-mull-padal-dup.c
new file mode 100644
index 0000000..c971f66
--- /dev/null
+++ b/src/qs8-igemm/gen/3x16c2-minmax-neon-mull-padal-dup.c
@@ -0,0 +1,433 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/c2-neon-mull-padal-dup.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (3 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+
+  do {
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc1x0123 = vacc0x0123;
+    int32x4_t vacc1x4567 = vacc0x4567;
+    int32x4_t vacc1x89AB = vacc0x89AB;
+    int32x4_t vacc1xCDEF = vacc0xCDEF;
+    int32x4_t vacc2x0123 = vacc0x0123;
+    int32x4_t vacc2x4567 = vacc0x4567;
+    int32x4_t vacc2x89AB = vacc0x89AB;
+    int32x4_t vacc2xCDEF = vacc0xCDEF;
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      const int8_t* restrict a2 = a[2];
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+      }
+      a += 3;
+
+      size_t k = kc;
+
+
+      while (k >= 8 * sizeof(int8_t)) {
+        const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+        const int8x8_t va2 = vld1_s8(a2); a2 += 8;
+
+        const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+        const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+        const int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
+        const int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
+        const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+        const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+        const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+        const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+        const int16x8_t vprod1x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        const int16x8_t vprod1x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+        const int16x8_t vprod1x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+        vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc0);
+        vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc1);
+        vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
+        vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc3);
+        const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+        const int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+        vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0);
+        vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1);
+        vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
+        vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3);
+        const int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+        const int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+        const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+        const int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
+        const int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+        const int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+        const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+        const int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
+        const int16x8_t vprod2x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+        const int16x8_t vprod2x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+        const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+        const int16x8_t vprod2x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+        vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
+        vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1);
+        vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
+        vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3);
+        const int16x8_t vprod2xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+        const int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+        const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+        const int16x8_t vprod2xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+        vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0);
+        vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1);
+        vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2);
+        vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3);
+
+        k -= 8 * sizeof(int8_t);
+      }
+
+      if XNN_UNLIKELY(k != 0) {
+        const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+        const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
+        const int8x8_t va2 = vld1_s8(a2); a2 = (const int8_t*) ((uintptr_t) a2 + k);
+
+        const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+        const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+        const int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+        const int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+        const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+        const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+        const int16x8_t vprod1x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc0);
+        const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0);
+        const int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+        const int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+        const int16x8_t vprod2x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+        vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
+        const int16x8_t vprod2xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+        vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0);
+
+        if (k > 2 * sizeof(int8_t)) {
+          const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb89ABc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vbCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+          const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+          const int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+          const int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+          const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+          vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+          const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+          vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+          const int16x8_t vprod1x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+          vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc1);
+          const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+          vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1);
+          const int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+          vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+          const int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+          vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+          const int16x8_t vprod2x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+          vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1);
+          const int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+          vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1);
+
+          if (k > 4 * sizeof(int8_t)) {
+            const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int8x8_t vb89ABc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int8x8_t vbCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+            const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+            const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+            const int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+            const int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+            const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+            vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+            const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+            vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+            const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+            vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
+            const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+            vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
+            const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+            vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+            const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+            vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+            const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+            vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
+            const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+            vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2);
+          }
+        }
+      }
+      p -= 3 * sizeof(void*);
+    } while (p != 0);
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+    vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc1x89AB = vqrdmulhq_s32(vacc1x89AB, vmultiplier);
+    vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier);
+    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+    vacc2x89AB = vqrdmulhq_s32(vacc2x89AB, vmultiplier);
+    vacc2xCDEF = vqrdmulhq_s32(vacc2xCDEF, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+    vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc1x89AB = vsraq_n_s32(vacc1x89AB, vbicq_s32(vacc1x89AB, vzero_shift_mask), 31);
+    vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31);
+    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+    vacc2x89AB = vsraq_n_s32(vacc2x89AB, vbicq_s32(vacc2x89AB, vzero_shift_mask), 31);
+    vacc2xCDEF = vsraq_n_s32(vacc2xCDEF, vbicq_s32(vacc2xCDEF, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc1x89AB = vrshlq_s32(vacc1x89AB, vright_shift);
+    vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_shift);
+    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+    vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_shift);
+    vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+    const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+    int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
+    int8x16_t vout2x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc2x89ABCDEF);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+    const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+    int8x16_t vout1x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc1x01234567), vqmovn_s16(vacc1x89ABCDEF));
+    int8x16_t vout2x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc2x89ABCDEF));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout2x0123456789ABCDEF = vmaxq_s8(vout2x0123456789ABCDEF, voutput_min);
+    vout1x0123456789ABCDEF = vmaxq_s8(vout1x0123456789ABCDEF, voutput_min);
+    vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+
+    vout2x0123456789ABCDEF = vminq_s8(vout2x0123456789ABCDEF, voutput_max);
+    vout1x0123456789ABCDEF = vminq_s8(vout1x0123456789ABCDEF, voutput_max);
+    vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+
+    if (nc >= 16) {
+      vst1q_s8(c2 + 0, vout2x0123456789ABCDEF);
+      vst1q_s8(c1 + 0, vout1x0123456789ABCDEF);
+      vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 16;
+    } else {
+      int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8(vout1x0123456789ABCDEF));
+      int8x8_t vout2x01234567 = vget_low_s8(vout2x0123456789ABCDEF);
+      if (nc & 8) {
+        vst1_s8(c2, vout2x01234567); c2 += 8;
+        vst1_s8(c1, vget_high_s8(vout0x01234567_1x01234567)); c1 += 8;
+        vst1_s8(c0, vget_low_s8(vout0x01234567_1x01234567)); c0 += 8;
+        vout2x01234567 = vget_high_s8(vout2x0123456789ABCDEF);
+        vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0123456789ABCDEF));
+      }
+      if (nc & 4) {
+        vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1_lane_s8(c2, vout2x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/3x16c8-minmax-avx512skx.c b/src/qs8-igemm/gen/3x16c8-minmax-avx512skx.c
index 3fe63e4..f9708b7 100644
--- a/src/qs8-igemm/gen/3x16c8-minmax-avx512skx.c
+++ b/src/qs8-igemm/gen/3x16c8-minmax-avx512skx.c
@@ -13,6 +13,7 @@
 
 #include <xnnpack/igemm.h>
 #include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx(
@@ -38,6 +39,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
diff --git a/src/qs8-igemm/gen/3x16c8-minmax-neon-mlal-padal.c b/src/qs8-igemm/gen/3x16c8-minmax-neon-mlal-padal.c
new file mode 100644
index 0000000..916f7bf
--- /dev/null
+++ b/src/qs8-igemm/gen/3x16c8-minmax-neon-mlal-padal.c
@@ -0,0 +1,681 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/c8-neon-mull-padal.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (3 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+
+  do {
+    int32x4_t vacc0x0 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x1 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x2 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x3 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x4 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x5 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x6 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x7 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x8 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x9 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x10 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x11 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x12 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x13 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x14 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x15 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc1x0 = vacc0x0;
+    int32x4_t vacc1x1 = vacc0x1;
+    int32x4_t vacc1x2 = vacc0x2;
+    int32x4_t vacc1x3 = vacc0x3;
+    int32x4_t vacc1x4 = vacc0x4;
+    int32x4_t vacc1x5 = vacc0x5;
+    int32x4_t vacc1x6 = vacc0x6;
+    int32x4_t vacc1x7 = vacc0x7;
+    int32x4_t vacc1x8 = vacc0x8;
+    int32x4_t vacc1x9 = vacc0x9;
+    int32x4_t vacc1x10 = vacc0x10;
+    int32x4_t vacc1x11 = vacc0x11;
+    int32x4_t vacc1x12 = vacc0x12;
+    int32x4_t vacc1x13 = vacc0x13;
+    int32x4_t vacc1x14 = vacc0x14;
+    int32x4_t vacc1x15 = vacc0x15;
+    int32x4_t vacc2x0 = vacc0x0;
+    int32x4_t vacc2x1 = vacc0x1;
+    int32x4_t vacc2x2 = vacc0x2;
+    int32x4_t vacc2x3 = vacc0x3;
+    int32x4_t vacc2x4 = vacc0x4;
+    int32x4_t vacc2x5 = vacc0x5;
+    int32x4_t vacc2x6 = vacc0x6;
+    int32x4_t vacc2x7 = vacc0x7;
+    int32x4_t vacc2x8 = vacc0x8;
+    int32x4_t vacc2x9 = vacc0x9;
+    int32x4_t vacc2x10 = vacc0x10;
+    int32x4_t vacc2x11 = vacc0x11;
+    int32x4_t vacc2x12 = vacc0x12;
+    int32x4_t vacc2x13 = vacc0x13;
+    int32x4_t vacc2x14 = vacc0x14;
+    int32x4_t vacc2x15 = vacc0x15;
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      const int8_t* restrict a2 = a[2];
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+      }
+      a += 3;
+
+      size_t k = kc;
+      // 2x partial unrolled loop to load 16 bytes at a time using MLA.
+      while (k >= 16 * sizeof(int8_t)) {
+        const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va0x1 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va1x0 = vld1_s8(a1); a1 += 8;
+        const int8x8_t va1x1 = vld1_s8(a1); a1 += 8;
+        const int8x8_t va2x0 = vld1_s8(a2); a2 += 8;
+        const int8x8_t va2x1 = vld1_s8(a2); a2 += 8;
+
+        const int8x8_t vb0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb4x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb5x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb6x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb7x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb8x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb9x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb10x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb11x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb12x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb13x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb14x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb15x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+
+        const int8x8_t vb0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x0 = vmull_s8(vb0x0, va0x0);
+        int16x8_t vprod1x0 = vmull_s8(vb0x0, va1x0);
+        int16x8_t vprod2x0 = vmull_s8(vb0x0, va2x0);
+        vprod0x0 = vmlal_s8(vprod0x0, vb0x1, va0x1);
+        vprod1x0 = vmlal_s8(vprod1x0, vb0x1, va1x1);
+        vprod2x0 = vmlal_s8(vprod2x0, vb0x1, va2x1);
+        vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+        vacc1x0 = vpadalq_s16(vacc1x0, vprod1x0);
+        vacc2x0 = vpadalq_s16(vacc2x0, vprod2x0);
+        const int8x8_t vb1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x1 = vmull_s8(vb1x0, va0x0);
+        int16x8_t vprod1x1 = vmull_s8(vb1x0, va1x0);
+        int16x8_t vprod2x1 = vmull_s8(vb1x0, va2x0);
+        vprod0x1 = vmlal_s8(vprod0x1, vb1x1, va0x1);
+        vprod1x1 = vmlal_s8(vprod1x1, vb1x1, va1x1);
+        vprod2x1 = vmlal_s8(vprod2x1, vb1x1, va2x1);
+        vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+        vacc1x1 = vpadalq_s16(vacc1x1, vprod1x1);
+        vacc2x1 = vpadalq_s16(vacc2x1, vprod2x1);
+        const int8x8_t vb2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x2 = vmull_s8(vb2x0, va0x0);
+        int16x8_t vprod1x2 = vmull_s8(vb2x0, va1x0);
+        int16x8_t vprod2x2 = vmull_s8(vb2x0, va2x0);
+        vprod0x2 = vmlal_s8(vprod0x2, vb2x1, va0x1);
+        vprod1x2 = vmlal_s8(vprod1x2, vb2x1, va1x1);
+        vprod2x2 = vmlal_s8(vprod2x2, vb2x1, va2x1);
+        vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+        vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);
+        vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
+        const int8x8_t vb3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x3 = vmull_s8(vb3x0, va0x0);
+        int16x8_t vprod1x3 = vmull_s8(vb3x0, va1x0);
+        int16x8_t vprod2x3 = vmull_s8(vb3x0, va2x0);
+        vprod0x3 = vmlal_s8(vprod0x3, vb3x1, va0x1);
+        vprod1x3 = vmlal_s8(vprod1x3, vb3x1, va1x1);
+        vprod2x3 = vmlal_s8(vprod2x3, vb3x1, va2x1);
+        vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+        vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);
+        vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
+        const int8x8_t vb4x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x4 = vmull_s8(vb4x0, va0x0);
+        int16x8_t vprod1x4 = vmull_s8(vb4x0, va1x0);
+        int16x8_t vprod2x4 = vmull_s8(vb4x0, va2x0);
+        vprod0x4 = vmlal_s8(vprod0x4, vb4x1, va0x1);
+        vprod1x4 = vmlal_s8(vprod1x4, vb4x1, va1x1);
+        vprod2x4 = vmlal_s8(vprod2x4, vb4x1, va2x1);
+        vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+        vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);
+        vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);
+        const int8x8_t vb5x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x5 = vmull_s8(vb5x0, va0x0);
+        int16x8_t vprod1x5 = vmull_s8(vb5x0, va1x0);
+        int16x8_t vprod2x5 = vmull_s8(vb5x0, va2x0);
+        vprod0x5 = vmlal_s8(vprod0x5, vb5x1, va0x1);
+        vprod1x5 = vmlal_s8(vprod1x5, vb5x1, va1x1);
+        vprod2x5 = vmlal_s8(vprod2x5, vb5x1, va2x1);
+        vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+        vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);
+        vacc2x5 = vpadalq_s16(vacc2x5, vprod2x5);
+        const int8x8_t vb6x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x6 = vmull_s8(vb6x0, va0x0);
+        int16x8_t vprod1x6 = vmull_s8(vb6x0, va1x0);
+        int16x8_t vprod2x6 = vmull_s8(vb6x0, va2x0);
+        vprod0x6 = vmlal_s8(vprod0x6, vb6x1, va0x1);
+        vprod1x6 = vmlal_s8(vprod1x6, vb6x1, va1x1);
+        vprod2x6 = vmlal_s8(vprod2x6, vb6x1, va2x1);
+        vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+        vacc1x6 = vpadalq_s16(vacc1x6, vprod1x6);
+        vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6);
+        const int8x8_t vb7x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x7 = vmull_s8(vb7x0, va0x0);
+        int16x8_t vprod1x7 = vmull_s8(vb7x0, va1x0);
+        int16x8_t vprod2x7 = vmull_s8(vb7x0, va2x0);
+        vprod0x7 = vmlal_s8(vprod0x7, vb7x1, va0x1);
+        vprod1x7 = vmlal_s8(vprod1x7, vb7x1, va1x1);
+        vprod2x7 = vmlal_s8(vprod2x7, vb7x1, va2x1);
+        vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+        vacc1x7 = vpadalq_s16(vacc1x7, vprod1x7);
+        vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7);
+        const int8x8_t vb8x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x8 = vmull_s8(vb8x0, va0x0);
+        int16x8_t vprod1x8 = vmull_s8(vb8x0, va1x0);
+        int16x8_t vprod2x8 = vmull_s8(vb8x0, va2x0);
+        vprod0x8 = vmlal_s8(vprod0x8, vb8x1, va0x1);
+        vprod1x8 = vmlal_s8(vprod1x8, vb8x1, va1x1);
+        vprod2x8 = vmlal_s8(vprod2x8, vb8x1, va2x1);
+        vacc0x8 = vpadalq_s16(vacc0x8, vprod0x8);
+        vacc1x8 = vpadalq_s16(vacc1x8, vprod1x8);
+        vacc2x8 = vpadalq_s16(vacc2x8, vprod2x8);
+        const int8x8_t vb9x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x9 = vmull_s8(vb9x0, va0x0);
+        int16x8_t vprod1x9 = vmull_s8(vb9x0, va1x0);
+        int16x8_t vprod2x9 = vmull_s8(vb9x0, va2x0);
+        vprod0x9 = vmlal_s8(vprod0x9, vb9x1, va0x1);
+        vprod1x9 = vmlal_s8(vprod1x9, vb9x1, va1x1);
+        vprod2x9 = vmlal_s8(vprod2x9, vb9x1, va2x1);
+        vacc0x9 = vpadalq_s16(vacc0x9, vprod0x9);
+        vacc1x9 = vpadalq_s16(vacc1x9, vprod1x9);
+        vacc2x9 = vpadalq_s16(vacc2x9, vprod2x9);
+        const int8x8_t vb10x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x10 = vmull_s8(vb10x0, va0x0);
+        int16x8_t vprod1x10 = vmull_s8(vb10x0, va1x0);
+        int16x8_t vprod2x10 = vmull_s8(vb10x0, va2x0);
+        vprod0x10 = vmlal_s8(vprod0x10, vb10x1, va0x1);
+        vprod1x10 = vmlal_s8(vprod1x10, vb10x1, va1x1);
+        vprod2x10 = vmlal_s8(vprod2x10, vb10x1, va2x1);
+        vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
+        vacc1x10 = vpadalq_s16(vacc1x10, vprod1x10);
+        vacc2x10 = vpadalq_s16(vacc2x10, vprod2x10);
+        const int8x8_t vb11x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x11 = vmull_s8(vb11x0, va0x0);
+        int16x8_t vprod1x11 = vmull_s8(vb11x0, va1x0);
+        int16x8_t vprod2x11 = vmull_s8(vb11x0, va2x0);
+        vprod0x11 = vmlal_s8(vprod0x11, vb11x1, va0x1);
+        vprod1x11 = vmlal_s8(vprod1x11, vb11x1, va1x1);
+        vprod2x11 = vmlal_s8(vprod2x11, vb11x1, va2x1);
+        vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11);
+        vacc1x11 = vpadalq_s16(vacc1x11, vprod1x11);
+        vacc2x11 = vpadalq_s16(vacc2x11, vprod2x11);
+        const int8x8_t vb12x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x12 = vmull_s8(vb12x0, va0x0);
+        int16x8_t vprod1x12 = vmull_s8(vb12x0, va1x0);
+        int16x8_t vprod2x12 = vmull_s8(vb12x0, va2x0);
+        vprod0x12 = vmlal_s8(vprod0x12, vb12x1, va0x1);
+        vprod1x12 = vmlal_s8(vprod1x12, vb12x1, va1x1);
+        vprod2x12 = vmlal_s8(vprod2x12, vb12x1, va2x1);
+        vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);
+        vacc1x12 = vpadalq_s16(vacc1x12, vprod1x12);
+        vacc2x12 = vpadalq_s16(vacc2x12, vprod2x12);
+        const int8x8_t vb13x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x13 = vmull_s8(vb13x0, va0x0);
+        int16x8_t vprod1x13 = vmull_s8(vb13x0, va1x0);
+        int16x8_t vprod2x13 = vmull_s8(vb13x0, va2x0);
+        vprod0x13 = vmlal_s8(vprod0x13, vb13x1, va0x1);
+        vprod1x13 = vmlal_s8(vprod1x13, vb13x1, va1x1);
+        vprod2x13 = vmlal_s8(vprod2x13, vb13x1, va2x1);
+        vacc0x13 = vpadalq_s16(vacc0x13, vprod0x13);
+        vacc1x13 = vpadalq_s16(vacc1x13, vprod1x13);
+        vacc2x13 = vpadalq_s16(vacc2x13, vprod2x13);
+        const int8x8_t vb14x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x14 = vmull_s8(vb14x0, va0x0);
+        int16x8_t vprod1x14 = vmull_s8(vb14x0, va1x0);
+        int16x8_t vprod2x14 = vmull_s8(vb14x0, va2x0);
+        vprod0x14 = vmlal_s8(vprod0x14, vb14x1, va0x1);
+        vprod1x14 = vmlal_s8(vprod1x14, vb14x1, va1x1);
+        vprod2x14 = vmlal_s8(vprod2x14, vb14x1, va2x1);
+        vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
+        vacc1x14 = vpadalq_s16(vacc1x14, vprod1x14);
+        vacc2x14 = vpadalq_s16(vacc2x14, vprod2x14);
+        const int8x8_t vb15x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x15 = vmull_s8(vb15x0, va0x0);
+        int16x8_t vprod1x15 = vmull_s8(vb15x0, va1x0);
+        int16x8_t vprod2x15 = vmull_s8(vb15x0, va2x0);
+        vprod0x15 = vmlal_s8(vprod0x15, vb15x1, va0x1);
+        vprod1x15 = vmlal_s8(vprod1x15, vb15x1, va1x1);
+        vprod2x15 = vmlal_s8(vprod2x15, vb15x1, va2x1);
+        vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
+        vacc1x15 = vpadalq_s16(vacc1x15, vprod1x15);
+        vacc2x15 = vpadalq_s16(vacc2x15, vprod2x15);
+
+        k -= 16 * sizeof(int8_t);
+      }
+
+      // Handle 8 bytes at a time using MUL.
+      if (k > 0) {
+        const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+        const int8x8_t va2 = vld1_s8(a2); a2 += 8;
+
+        const int8x8_t vb0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x0 = vmull_s8(vb0, va0);
+        const int16x8_t vprod1x0 = vmull_s8(vb0, va1);
+        const int16x8_t vprod2x0 = vmull_s8(vb0, va2);
+        vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+        vacc1x0 = vpadalq_s16(vacc1x0, vprod1x0);
+        vacc2x0 = vpadalq_s16(vacc2x0, vprod2x0);
+        const int8x8_t vb1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x1 = vmull_s8(vb1, va0);
+        const int16x8_t vprod1x1 = vmull_s8(vb1, va1);
+        const int16x8_t vprod2x1 = vmull_s8(vb1, va2);
+        vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+        vacc1x1 = vpadalq_s16(vacc1x1, vprod1x1);
+        vacc2x1 = vpadalq_s16(vacc2x1, vprod2x1);
+        const int8x8_t vb2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x2 = vmull_s8(vb2, va0);
+        const int16x8_t vprod1x2 = vmull_s8(vb2, va1);
+        const int16x8_t vprod2x2 = vmull_s8(vb2, va2);
+        vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+        vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);
+        vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
+        const int8x8_t vb3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x3 = vmull_s8(vb3, va0);
+        const int16x8_t vprod1x3 = vmull_s8(vb3, va1);
+        const int16x8_t vprod2x3 = vmull_s8(vb3, va2);
+        vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+        vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);
+        vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
+        const int8x8_t vb4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x4 = vmull_s8(vb4, va0);
+        const int16x8_t vprod1x4 = vmull_s8(vb4, va1);
+        const int16x8_t vprod2x4 = vmull_s8(vb4, va2);
+        vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+        vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);
+        vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);
+        const int8x8_t vb5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x5 = vmull_s8(vb5, va0);
+        const int16x8_t vprod1x5 = vmull_s8(vb5, va1);
+        const int16x8_t vprod2x5 = vmull_s8(vb5, va2);
+        vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+        vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);
+        vacc2x5 = vpadalq_s16(vacc2x5, vprod2x5);
+        const int8x8_t vb6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x6 = vmull_s8(vb6, va0);
+        const int16x8_t vprod1x6 = vmull_s8(vb6, va1);
+        const int16x8_t vprod2x6 = vmull_s8(vb6, va2);
+        vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+        vacc1x6 = vpadalq_s16(vacc1x6, vprod1x6);
+        vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6);
+        const int8x8_t vb7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x7 = vmull_s8(vb7, va0);
+        const int16x8_t vprod1x7 = vmull_s8(vb7, va1);
+        const int16x8_t vprod2x7 = vmull_s8(vb7, va2);
+        vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+        vacc1x7 = vpadalq_s16(vacc1x7, vprod1x7);
+        vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7);
+        const int8x8_t vb8 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x8 = vmull_s8(vb8, va0);
+        const int16x8_t vprod1x8 = vmull_s8(vb8, va1);
+        const int16x8_t vprod2x8 = vmull_s8(vb8, va2);
+        vacc0x8 = vpadalq_s16(vacc0x8, vprod0x8);
+        vacc1x8 = vpadalq_s16(vacc1x8, vprod1x8);
+        vacc2x8 = vpadalq_s16(vacc2x8, vprod2x8);
+        const int8x8_t vb9 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x9 = vmull_s8(vb9, va0);
+        const int16x8_t vprod1x9 = vmull_s8(vb9, va1);
+        const int16x8_t vprod2x9 = vmull_s8(vb9, va2);
+        vacc0x9 = vpadalq_s16(vacc0x9, vprod0x9);
+        vacc1x9 = vpadalq_s16(vacc1x9, vprod1x9);
+        vacc2x9 = vpadalq_s16(vacc2x9, vprod2x9);
+        const int8x8_t vb10 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x10 = vmull_s8(vb10, va0);
+        const int16x8_t vprod1x10 = vmull_s8(vb10, va1);
+        const int16x8_t vprod2x10 = vmull_s8(vb10, va2);
+        vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
+        vacc1x10 = vpadalq_s16(vacc1x10, vprod1x10);
+        vacc2x10 = vpadalq_s16(vacc2x10, vprod2x10);
+        const int8x8_t vb11 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x11 = vmull_s8(vb11, va0);
+        const int16x8_t vprod1x11 = vmull_s8(vb11, va1);
+        const int16x8_t vprod2x11 = vmull_s8(vb11, va2);
+        vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11);
+        vacc1x11 = vpadalq_s16(vacc1x11, vprod1x11);
+        vacc2x11 = vpadalq_s16(vacc2x11, vprod2x11);
+        const int8x8_t vb12 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x12 = vmull_s8(vb12, va0);
+        const int16x8_t vprod1x12 = vmull_s8(vb12, va1);
+        const int16x8_t vprod2x12 = vmull_s8(vb12, va2);
+        vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);
+        vacc1x12 = vpadalq_s16(vacc1x12, vprod1x12);
+        vacc2x12 = vpadalq_s16(vacc2x12, vprod2x12);
+        const int8x8_t vb13 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x13 = vmull_s8(vb13, va0);
+        const int16x8_t vprod1x13 = vmull_s8(vb13, va1);
+        const int16x8_t vprod2x13 = vmull_s8(vb13, va2);
+        vacc0x13 = vpadalq_s16(vacc0x13, vprod0x13);
+        vacc1x13 = vpadalq_s16(vacc1x13, vprod1x13);
+        vacc2x13 = vpadalq_s16(vacc2x13, vprod2x13);
+        const int8x8_t vb14 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x14 = vmull_s8(vb14, va0);
+        const int16x8_t vprod1x14 = vmull_s8(vb14, va1);
+        const int16x8_t vprod2x14 = vmull_s8(vb14, va2);
+        vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
+        vacc1x14 = vpadalq_s16(vacc1x14, vprod1x14);
+        vacc2x14 = vpadalq_s16(vacc2x14, vprod2x14);
+        const int8x8_t vb15 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x15 = vmull_s8(vb15, va0);
+        const int16x8_t vprod1x15 = vmull_s8(vb15, va1);
+        const int16x8_t vprod2x15 = vmull_s8(vb15, va2);
+        vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
+        vacc1x15 = vpadalq_s16(vacc1x15, vprod1x15);
+        vacc2x15 = vpadalq_s16(vacc2x15, vprod2x15);
+
+        k -= 8 * sizeof(int8_t);
+      }
+
+      p -= 3 * sizeof(void*);
+    } while (p != 0);
+
+#if XNN_ARCH_ARM64
+    const int32x4_t vsum0x01 = vpaddq_s32(vacc0x0, vacc0x1);
+    const int32x4_t vsum0x23 = vpaddq_s32(vacc0x2, vacc0x3);
+    const int32x4_t vsum0x45 = vpaddq_s32(vacc0x4, vacc0x5);
+    const int32x4_t vsum0x67 = vpaddq_s32(vacc0x6, vacc0x7);
+    const int32x4_t vsum0x89 = vpaddq_s32(vacc0x8, vacc0x9);
+    const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11);
+    const int32x4_t vsum0xCD = vpaddq_s32(vacc0x12, vacc0x13);
+    const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
+    const int32x4_t vsum1x01 = vpaddq_s32(vacc1x0, vacc1x1);
+    const int32x4_t vsum1x23 = vpaddq_s32(vacc1x2, vacc1x3);
+    const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);
+    const int32x4_t vsum1x67 = vpaddq_s32(vacc1x6, vacc1x7);
+    const int32x4_t vsum1x89 = vpaddq_s32(vacc1x8, vacc1x9);
+    const int32x4_t vsum1xAB = vpaddq_s32(vacc1x10, vacc1x11);
+    const int32x4_t vsum1xCD = vpaddq_s32(vacc1x12, vacc1x13);
+    const int32x4_t vsum1xEF = vpaddq_s32(vacc1x14, vacc1x15);
+    const int32x4_t vsum2x01 = vpaddq_s32(vacc2x0, vacc2x1);
+    const int32x4_t vsum2x23 = vpaddq_s32(vacc2x2, vacc2x3);
+    const int32x4_t vsum2x45 = vpaddq_s32(vacc2x4, vacc2x5);
+    const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7);
+    const int32x4_t vsum2x89 = vpaddq_s32(vacc2x8, vacc2x9);
+    const int32x4_t vsum2xAB = vpaddq_s32(vacc2x10, vacc2x11);
+    const int32x4_t vsum2xCD = vpaddq_s32(vacc2x12, vacc2x13);
+    const int32x4_t vsum2xEF = vpaddq_s32(vacc2x14, vacc2x15);
+    int32x4_t vacc0x0123 = vpaddq_s32(vsum0x01, vsum0x23);
+    int32x4_t vacc0x4567 = vpaddq_s32(vsum0x45, vsum0x67);
+    int32x4_t vacc0x89AB = vpaddq_s32(vsum0x89, vsum0xAB);
+    int32x4_t vacc0xCDEF = vpaddq_s32(vsum0xCD, vsum0xEF);
+    int32x4_t vacc1x0123 = vpaddq_s32(vsum1x01, vsum1x23);
+    int32x4_t vacc1x4567 = vpaddq_s32(vsum1x45, vsum1x67);
+    int32x4_t vacc1x89AB = vpaddq_s32(vsum1x89, vsum1xAB);
+    int32x4_t vacc1xCDEF = vpaddq_s32(vsum1xCD, vsum1xEF);
+    int32x4_t vacc2x0123 = vpaddq_s32(vsum2x01, vsum2x23);
+    int32x4_t vacc2x4567 = vpaddq_s32(vsum2x45, vsum2x67);
+    int32x4_t vacc2x89AB = vpaddq_s32(vsum2x89, vsum2xAB);
+    int32x4_t vacc2xCDEF = vpaddq_s32(vsum2xCD, vsum2xEF);
+#else
+    const int32x2_t vpsum0x0 = vadd_s32(vget_low_s32(vacc0x0), vget_high_s32(vacc0x0));
+    const int32x2_t vpsum0x1 = vadd_s32(vget_low_s32(vacc0x1), vget_high_s32(vacc0x1));
+    const int32x2_t vpsum0x2 = vadd_s32(vget_low_s32(vacc0x2), vget_high_s32(vacc0x2));
+    const int32x2_t vpsum0x3 = vadd_s32(vget_low_s32(vacc0x3), vget_high_s32(vacc0x3));
+    const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1);
+    const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3);
+    int32x4_t vacc0x0123 = vcombine_s32(vsum0x01, vsum0x23 );
+    const int32x2_t vpsum0x4 = vadd_s32(vget_low_s32(vacc0x4), vget_high_s32(vacc0x4));
+    const int32x2_t vpsum0x5 = vadd_s32(vget_low_s32(vacc0x5), vget_high_s32(vacc0x5));
+    const int32x2_t vpsum0x6 = vadd_s32(vget_low_s32(vacc0x6), vget_high_s32(vacc0x6));
+    const int32x2_t vpsum0x7 = vadd_s32(vget_low_s32(vacc0x7), vget_high_s32(vacc0x7));
+    const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5);
+    const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7);
+    int32x4_t vacc0x4567 = vcombine_s32(vsum0x45, vsum0x67 );
+    const int32x2_t vpsum0x8 = vadd_s32(vget_low_s32(vacc0x8), vget_high_s32(vacc0x8));
+    const int32x2_t vpsum0x9 = vadd_s32(vget_low_s32(vacc0x9), vget_high_s32(vacc0x9));
+    const int32x2_t vpsum0xA = vadd_s32(vget_low_s32(vacc0x10), vget_high_s32(vacc0x10));
+    const int32x2_t vpsum0xB = vadd_s32(vget_low_s32(vacc0x11), vget_high_s32(vacc0x11));
+    const int32x2_t vsum0x89 = vpadd_s32(vpsum0x8, vpsum0x9);
+    const int32x2_t vsum0xAB = vpadd_s32(vpsum0xA, vpsum0xB);
+    int32x4_t vacc0x89AB = vcombine_s32(vsum0x89, vsum0xAB );
+    const int32x2_t vpsum0xC = vadd_s32(vget_low_s32(vacc0x12), vget_high_s32(vacc0x12));
+    const int32x2_t vpsum0xD = vadd_s32(vget_low_s32(vacc0x13), vget_high_s32(vacc0x13));
+    const int32x2_t vpsum0xE = vadd_s32(vget_low_s32(vacc0x14), vget_high_s32(vacc0x14));
+    const int32x2_t vpsum0xF = vadd_s32(vget_low_s32(vacc0x15), vget_high_s32(vacc0x15));
+    const int32x2_t vsum0xCD = vpadd_s32(vpsum0xC, vpsum0xD);
+    const int32x2_t vsum0xEF = vpadd_s32(vpsum0xE, vpsum0xF);
+    int32x4_t vacc0xCDEF = vcombine_s32(vsum0xCD, vsum0xEF );
+    const int32x2_t vpsum1x0 = vadd_s32(vget_low_s32(vacc1x0), vget_high_s32(vacc1x0));
+    const int32x2_t vpsum1x1 = vadd_s32(vget_low_s32(vacc1x1), vget_high_s32(vacc1x1));
+    const int32x2_t vpsum1x2 = vadd_s32(vget_low_s32(vacc1x2), vget_high_s32(vacc1x2));
+    const int32x2_t vpsum1x3 = vadd_s32(vget_low_s32(vacc1x3), vget_high_s32(vacc1x3));
+    const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1);
+    const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3);
+    int32x4_t vacc1x0123 = vcombine_s32(vsum1x01, vsum1x23 );
+    const int32x2_t vpsum1x4 = vadd_s32(vget_low_s32(vacc1x4), vget_high_s32(vacc1x4));
+    const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));
+    const int32x2_t vpsum1x6 = vadd_s32(vget_low_s32(vacc1x6), vget_high_s32(vacc1x6));
+    const int32x2_t vpsum1x7 = vadd_s32(vget_low_s32(vacc1x7), vget_high_s32(vacc1x7));
+    const int32x2_t vsum1x45 = vpadd_s32(vpsum1x4, vpsum1x5);
+    const int32x2_t vsum1x67 = vpadd_s32(vpsum1x6, vpsum1x7);
+    int32x4_t vacc1x4567 = vcombine_s32(vsum1x45, vsum1x67 );
+    const int32x2_t vpsum1x8 = vadd_s32(vget_low_s32(vacc1x8), vget_high_s32(vacc1x8));
+    const int32x2_t vpsum1x9 = vadd_s32(vget_low_s32(vacc1x9), vget_high_s32(vacc1x9));
+    const int32x2_t vpsum1xA = vadd_s32(vget_low_s32(vacc1x10), vget_high_s32(vacc1x10));
+    const int32x2_t vpsum1xB = vadd_s32(vget_low_s32(vacc1x11), vget_high_s32(vacc1x11));
+    const int32x2_t vsum1x89 = vpadd_s32(vpsum1x8, vpsum1x9);
+    const int32x2_t vsum1xAB = vpadd_s32(vpsum1xA, vpsum1xB);
+    int32x4_t vacc1x89AB = vcombine_s32(vsum1x89, vsum1xAB );
+    const int32x2_t vpsum1xC = vadd_s32(vget_low_s32(vacc1x12), vget_high_s32(vacc1x12));
+    const int32x2_t vpsum1xD = vadd_s32(vget_low_s32(vacc1x13), vget_high_s32(vacc1x13));
+    const int32x2_t vpsum1xE = vadd_s32(vget_low_s32(vacc1x14), vget_high_s32(vacc1x14));
+    const int32x2_t vpsum1xF = vadd_s32(vget_low_s32(vacc1x15), vget_high_s32(vacc1x15));
+    const int32x2_t vsum1xCD = vpadd_s32(vpsum1xC, vpsum1xD);
+    const int32x2_t vsum1xEF = vpadd_s32(vpsum1xE, vpsum1xF);
+    int32x4_t vacc1xCDEF = vcombine_s32(vsum1xCD, vsum1xEF );
+    const int32x2_t vpsum2x0 = vadd_s32(vget_low_s32(vacc2x0), vget_high_s32(vacc2x0));
+    const int32x2_t vpsum2x1 = vadd_s32(vget_low_s32(vacc2x1), vget_high_s32(vacc2x1));
+    const int32x2_t vpsum2x2 = vadd_s32(vget_low_s32(vacc2x2), vget_high_s32(vacc2x2));
+    const int32x2_t vpsum2x3 = vadd_s32(vget_low_s32(vacc2x3), vget_high_s32(vacc2x3));
+    const int32x2_t vsum2x01 = vpadd_s32(vpsum2x0, vpsum2x1);
+    const int32x2_t vsum2x23 = vpadd_s32(vpsum2x2, vpsum2x3);
+    int32x4_t vacc2x0123 = vcombine_s32(vsum2x01, vsum2x23 );
+    const int32x2_t vpsum2x4 = vadd_s32(vget_low_s32(vacc2x4), vget_high_s32(vacc2x4));
+    const int32x2_t vpsum2x5 = vadd_s32(vget_low_s32(vacc2x5), vget_high_s32(vacc2x5));
+    const int32x2_t vpsum2x6 = vadd_s32(vget_low_s32(vacc2x6), vget_high_s32(vacc2x6));
+    const int32x2_t vpsum2x7 = vadd_s32(vget_low_s32(vacc2x7), vget_high_s32(vacc2x7));
+    const int32x2_t vsum2x45 = vpadd_s32(vpsum2x4, vpsum2x5);
+    const int32x2_t vsum2x67 = vpadd_s32(vpsum2x6, vpsum2x7);
+    int32x4_t vacc2x4567 = vcombine_s32(vsum2x45, vsum2x67 );
+    const int32x2_t vpsum2x8 = vadd_s32(vget_low_s32(vacc2x8), vget_high_s32(vacc2x8));
+    const int32x2_t vpsum2x9 = vadd_s32(vget_low_s32(vacc2x9), vget_high_s32(vacc2x9));
+    const int32x2_t vpsum2xA = vadd_s32(vget_low_s32(vacc2x10), vget_high_s32(vacc2x10));
+    const int32x2_t vpsum2xB = vadd_s32(vget_low_s32(vacc2x11), vget_high_s32(vacc2x11));
+    const int32x2_t vsum2x89 = vpadd_s32(vpsum2x8, vpsum2x9);
+    const int32x2_t vsum2xAB = vpadd_s32(vpsum2xA, vpsum2xB);
+    int32x4_t vacc2x89AB = vcombine_s32(vsum2x89, vsum2xAB );
+    const int32x2_t vpsum2xC = vadd_s32(vget_low_s32(vacc2x12), vget_high_s32(vacc2x12));
+    const int32x2_t vpsum2xD = vadd_s32(vget_low_s32(vacc2x13), vget_high_s32(vacc2x13));
+    const int32x2_t vpsum2xE = vadd_s32(vget_low_s32(vacc2x14), vget_high_s32(vacc2x14));
+    const int32x2_t vpsum2xF = vadd_s32(vget_low_s32(vacc2x15), vget_high_s32(vacc2x15));
+    const int32x2_t vsum2xCD = vpadd_s32(vpsum2xC, vpsum2xD);
+    const int32x2_t vsum2xEF = vpadd_s32(vpsum2xE, vpsum2xF);
+    int32x4_t vacc2xCDEF = vcombine_s32(vsum2xCD, vsum2xEF );
+#endif
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+    vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc1x89AB = vqrdmulhq_s32(vacc1x89AB, vmultiplier);
+    vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier);
+    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+    vacc2x89AB = vqrdmulhq_s32(vacc2x89AB, vmultiplier);
+    vacc2xCDEF = vqrdmulhq_s32(vacc2xCDEF, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+    vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc1x89AB = vsraq_n_s32(vacc1x89AB, vbicq_s32(vacc1x89AB, vzero_shift_mask), 31);
+    vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31);
+    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+    vacc2x89AB = vsraq_n_s32(vacc2x89AB, vbicq_s32(vacc2x89AB, vzero_shift_mask), 31);
+    vacc2xCDEF = vsraq_n_s32(vacc2xCDEF, vbicq_s32(vacc2xCDEF, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc1x89AB = vrshlq_s32(vacc1x89AB, vright_shift);
+    vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_shift);
+    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+    vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_shift);
+    vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+    const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), voutput_zero_point);
+    int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+    int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
+    int8x16_t vout2x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc2x89ABCDEF);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+    const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+    int8x16_t vout1x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc1x01234567), vqmovn_s16(vacc1x89ABCDEF));
+    int8x16_t vout2x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc2x89ABCDEF));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout2x0123456789ABCDEF = vmaxq_s8(vout2x0123456789ABCDEF, voutput_min);
+    vout1x0123456789ABCDEF = vmaxq_s8(vout1x0123456789ABCDEF, voutput_min);
+    vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+
+    vout2x0123456789ABCDEF = vminq_s8(vout2x0123456789ABCDEF, voutput_max);
+    vout1x0123456789ABCDEF = vminq_s8(vout1x0123456789ABCDEF, voutput_max);
+    vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+
+    if (nc >= 16) {
+      vst1q_s8(c2 + 0, vout2x0123456789ABCDEF);
+      vst1q_s8(c1 + 0, vout1x0123456789ABCDEF);
+      vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 16;
+    } else {
+      int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8(vout1x0123456789ABCDEF));
+      int8x8_t vout2x01234567 = vget_low_s8(vout2x0123456789ABCDEF);
+      if (nc & 8) {
+        vst1_s8(c2, vout2x01234567); c2 += 8;
+        vst1_s8(c1, vget_high_s8(vout0x01234567_1x01234567)); c1 += 8;
+        vst1_s8(c0, vget_low_s8(vout0x01234567_1x01234567)); c0 += 8;
+        vout2x01234567 = vget_high_s8(vout2x0123456789ABCDEF);
+        vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0123456789ABCDEF));
+      }
+      if (nc & 4) {
+        vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1_lane_s8(c2, vout2x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/3x16c8-minmax-neon-mull-padal.c b/src/qs8-igemm/gen/3x16c8-minmax-neon-mull-padal.c
new file mode 100644
index 0000000..db7aef0
--- /dev/null
+++ b/src/qs8-igemm/gen/3x16c8-minmax-neon-mull-padal.c
@@ -0,0 +1,492 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/c8-neon-mull-padal.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (3 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+
+  do {
+    int32x4_t vacc0x0 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x1 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x2 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x3 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x4 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x5 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x6 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x7 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x8 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x9 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x10 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x11 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x12 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x13 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x14 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x15 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc1x0 = vacc0x0;
+    int32x4_t vacc1x1 = vacc0x1;
+    int32x4_t vacc1x2 = vacc0x2;
+    int32x4_t vacc1x3 = vacc0x3;
+    int32x4_t vacc1x4 = vacc0x4;
+    int32x4_t vacc1x5 = vacc0x5;
+    int32x4_t vacc1x6 = vacc0x6;
+    int32x4_t vacc1x7 = vacc0x7;
+    int32x4_t vacc1x8 = vacc0x8;
+    int32x4_t vacc1x9 = vacc0x9;
+    int32x4_t vacc1x10 = vacc0x10;
+    int32x4_t vacc1x11 = vacc0x11;
+    int32x4_t vacc1x12 = vacc0x12;
+    int32x4_t vacc1x13 = vacc0x13;
+    int32x4_t vacc1x14 = vacc0x14;
+    int32x4_t vacc1x15 = vacc0x15;
+    int32x4_t vacc2x0 = vacc0x0;
+    int32x4_t vacc2x1 = vacc0x1;
+    int32x4_t vacc2x2 = vacc0x2;
+    int32x4_t vacc2x3 = vacc0x3;
+    int32x4_t vacc2x4 = vacc0x4;
+    int32x4_t vacc2x5 = vacc0x5;
+    int32x4_t vacc2x6 = vacc0x6;
+    int32x4_t vacc2x7 = vacc0x7;
+    int32x4_t vacc2x8 = vacc0x8;
+    int32x4_t vacc2x9 = vacc0x9;
+    int32x4_t vacc2x10 = vacc0x10;
+    int32x4_t vacc2x11 = vacc0x11;
+    int32x4_t vacc2x12 = vacc0x12;
+    int32x4_t vacc2x13 = vacc0x13;
+    int32x4_t vacc2x14 = vacc0x14;
+    int32x4_t vacc2x15 = vacc0x15;
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      const int8_t* restrict a2 = a[2];
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+      }
+      a += 3;
+
+      size_t k = kc;
+
+      // Handle 8 bytes at a time using MUL.
+      while (k > 0) {
+        const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+        const int8x8_t va2 = vld1_s8(a2); a2 += 8;
+
+        const int8x8_t vb0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x0 = vmull_s8(vb0, va0);
+        const int16x8_t vprod1x0 = vmull_s8(vb0, va1);
+        const int16x8_t vprod2x0 = vmull_s8(vb0, va2);
+        vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+        vacc1x0 = vpadalq_s16(vacc1x0, vprod1x0);
+        vacc2x0 = vpadalq_s16(vacc2x0, vprod2x0);
+        const int8x8_t vb1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x1 = vmull_s8(vb1, va0);
+        const int16x8_t vprod1x1 = vmull_s8(vb1, va1);
+        const int16x8_t vprod2x1 = vmull_s8(vb1, va2);
+        vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+        vacc1x1 = vpadalq_s16(vacc1x1, vprod1x1);
+        vacc2x1 = vpadalq_s16(vacc2x1, vprod2x1);
+        const int8x8_t vb2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x2 = vmull_s8(vb2, va0);
+        const int16x8_t vprod1x2 = vmull_s8(vb2, va1);
+        const int16x8_t vprod2x2 = vmull_s8(vb2, va2);
+        vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+        vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);
+        vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
+        const int8x8_t vb3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x3 = vmull_s8(vb3, va0);
+        const int16x8_t vprod1x3 = vmull_s8(vb3, va1);
+        const int16x8_t vprod2x3 = vmull_s8(vb3, va2);
+        vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+        vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);
+        vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
+        const int8x8_t vb4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x4 = vmull_s8(vb4, va0);
+        const int16x8_t vprod1x4 = vmull_s8(vb4, va1);
+        const int16x8_t vprod2x4 = vmull_s8(vb4, va2);
+        vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+        vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);
+        vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);
+        const int8x8_t vb5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x5 = vmull_s8(vb5, va0);
+        const int16x8_t vprod1x5 = vmull_s8(vb5, va1);
+        const int16x8_t vprod2x5 = vmull_s8(vb5, va2);
+        vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+        vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);
+        vacc2x5 = vpadalq_s16(vacc2x5, vprod2x5);
+        const int8x8_t vb6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x6 = vmull_s8(vb6, va0);
+        const int16x8_t vprod1x6 = vmull_s8(vb6, va1);
+        const int16x8_t vprod2x6 = vmull_s8(vb6, va2);
+        vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+        vacc1x6 = vpadalq_s16(vacc1x6, vprod1x6);
+        vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6);
+        const int8x8_t vb7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x7 = vmull_s8(vb7, va0);
+        const int16x8_t vprod1x7 = vmull_s8(vb7, va1);
+        const int16x8_t vprod2x7 = vmull_s8(vb7, va2);
+        vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+        vacc1x7 = vpadalq_s16(vacc1x7, vprod1x7);
+        vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7);
+        const int8x8_t vb8 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x8 = vmull_s8(vb8, va0);
+        const int16x8_t vprod1x8 = vmull_s8(vb8, va1);
+        const int16x8_t vprod2x8 = vmull_s8(vb8, va2);
+        vacc0x8 = vpadalq_s16(vacc0x8, vprod0x8);
+        vacc1x8 = vpadalq_s16(vacc1x8, vprod1x8);
+        vacc2x8 = vpadalq_s16(vacc2x8, vprod2x8);
+        const int8x8_t vb9 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x9 = vmull_s8(vb9, va0);
+        const int16x8_t vprod1x9 = vmull_s8(vb9, va1);
+        const int16x8_t vprod2x9 = vmull_s8(vb9, va2);
+        vacc0x9 = vpadalq_s16(vacc0x9, vprod0x9);
+        vacc1x9 = vpadalq_s16(vacc1x9, vprod1x9);
+        vacc2x9 = vpadalq_s16(vacc2x9, vprod2x9);
+        const int8x8_t vb10 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x10 = vmull_s8(vb10, va0);
+        const int16x8_t vprod1x10 = vmull_s8(vb10, va1);
+        const int16x8_t vprod2x10 = vmull_s8(vb10, va2);
+        vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
+        vacc1x10 = vpadalq_s16(vacc1x10, vprod1x10);
+        vacc2x10 = vpadalq_s16(vacc2x10, vprod2x10);
+        const int8x8_t vb11 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x11 = vmull_s8(vb11, va0);
+        const int16x8_t vprod1x11 = vmull_s8(vb11, va1);
+        const int16x8_t vprod2x11 = vmull_s8(vb11, va2);
+        vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11);
+        vacc1x11 = vpadalq_s16(vacc1x11, vprod1x11);
+        vacc2x11 = vpadalq_s16(vacc2x11, vprod2x11);
+        const int8x8_t vb12 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x12 = vmull_s8(vb12, va0);
+        const int16x8_t vprod1x12 = vmull_s8(vb12, va1);
+        const int16x8_t vprod2x12 = vmull_s8(vb12, va2);
+        vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);
+        vacc1x12 = vpadalq_s16(vacc1x12, vprod1x12);
+        vacc2x12 = vpadalq_s16(vacc2x12, vprod2x12);
+        const int8x8_t vb13 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x13 = vmull_s8(vb13, va0);
+        const int16x8_t vprod1x13 = vmull_s8(vb13, va1);
+        const int16x8_t vprod2x13 = vmull_s8(vb13, va2);
+        vacc0x13 = vpadalq_s16(vacc0x13, vprod0x13);
+        vacc1x13 = vpadalq_s16(vacc1x13, vprod1x13);
+        vacc2x13 = vpadalq_s16(vacc2x13, vprod2x13);
+        const int8x8_t vb14 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x14 = vmull_s8(vb14, va0);
+        const int16x8_t vprod1x14 = vmull_s8(vb14, va1);
+        const int16x8_t vprod2x14 = vmull_s8(vb14, va2);
+        vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
+        vacc1x14 = vpadalq_s16(vacc1x14, vprod1x14);
+        vacc2x14 = vpadalq_s16(vacc2x14, vprod2x14);
+        const int8x8_t vb15 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x15 = vmull_s8(vb15, va0);
+        const int16x8_t vprod1x15 = vmull_s8(vb15, va1);
+        const int16x8_t vprod2x15 = vmull_s8(vb15, va2);
+        vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
+        vacc1x15 = vpadalq_s16(vacc1x15, vprod1x15);
+        vacc2x15 = vpadalq_s16(vacc2x15, vprod2x15);
+
+        k -= 8 * sizeof(int8_t);
+      }
+
+      p -= 3 * sizeof(void*);
+    } while (p != 0);
+
+#if XNN_ARCH_ARM64
+    const int32x4_t vsum0x01 = vpaddq_s32(vacc0x0, vacc0x1);
+    const int32x4_t vsum0x23 = vpaddq_s32(vacc0x2, vacc0x3);
+    const int32x4_t vsum0x45 = vpaddq_s32(vacc0x4, vacc0x5);
+    const int32x4_t vsum0x67 = vpaddq_s32(vacc0x6, vacc0x7);
+    const int32x4_t vsum0x89 = vpaddq_s32(vacc0x8, vacc0x9);
+    const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11);
+    const int32x4_t vsum0xCD = vpaddq_s32(vacc0x12, vacc0x13);
+    const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
+    const int32x4_t vsum1x01 = vpaddq_s32(vacc1x0, vacc1x1);
+    const int32x4_t vsum1x23 = vpaddq_s32(vacc1x2, vacc1x3);
+    const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);
+    const int32x4_t vsum1x67 = vpaddq_s32(vacc1x6, vacc1x7);
+    const int32x4_t vsum1x89 = vpaddq_s32(vacc1x8, vacc1x9);
+    const int32x4_t vsum1xAB = vpaddq_s32(vacc1x10, vacc1x11);
+    const int32x4_t vsum1xCD = vpaddq_s32(vacc1x12, vacc1x13);
+    const int32x4_t vsum1xEF = vpaddq_s32(vacc1x14, vacc1x15);
+    const int32x4_t vsum2x01 = vpaddq_s32(vacc2x0, vacc2x1);
+    const int32x4_t vsum2x23 = vpaddq_s32(vacc2x2, vacc2x3);
+    const int32x4_t vsum2x45 = vpaddq_s32(vacc2x4, vacc2x5);
+    const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7);
+    const int32x4_t vsum2x89 = vpaddq_s32(vacc2x8, vacc2x9);
+    const int32x4_t vsum2xAB = vpaddq_s32(vacc2x10, vacc2x11);
+    const int32x4_t vsum2xCD = vpaddq_s32(vacc2x12, vacc2x13);
+    const int32x4_t vsum2xEF = vpaddq_s32(vacc2x14, vacc2x15);
+    int32x4_t vacc0x0123 = vpaddq_s32(vsum0x01, vsum0x23);
+    int32x4_t vacc0x4567 = vpaddq_s32(vsum0x45, vsum0x67);
+    int32x4_t vacc0x89AB = vpaddq_s32(vsum0x89, vsum0xAB);
+    int32x4_t vacc0xCDEF = vpaddq_s32(vsum0xCD, vsum0xEF);
+    int32x4_t vacc1x0123 = vpaddq_s32(vsum1x01, vsum1x23);
+    int32x4_t vacc1x4567 = vpaddq_s32(vsum1x45, vsum1x67);
+    int32x4_t vacc1x89AB = vpaddq_s32(vsum1x89, vsum1xAB);
+    int32x4_t vacc1xCDEF = vpaddq_s32(vsum1xCD, vsum1xEF);
+    int32x4_t vacc2x0123 = vpaddq_s32(vsum2x01, vsum2x23);
+    int32x4_t vacc2x4567 = vpaddq_s32(vsum2x45, vsum2x67);
+    int32x4_t vacc2x89AB = vpaddq_s32(vsum2x89, vsum2xAB);
+    int32x4_t vacc2xCDEF = vpaddq_s32(vsum2xCD, vsum2xEF);
+#else
+    const int32x2_t vpsum0x0 = vadd_s32(vget_low_s32(vacc0x0), vget_high_s32(vacc0x0));
+    const int32x2_t vpsum0x1 = vadd_s32(vget_low_s32(vacc0x1), vget_high_s32(vacc0x1));
+    const int32x2_t vpsum0x2 = vadd_s32(vget_low_s32(vacc0x2), vget_high_s32(vacc0x2));
+    const int32x2_t vpsum0x3 = vadd_s32(vget_low_s32(vacc0x3), vget_high_s32(vacc0x3));
+    const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1);
+    const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3);
+    int32x4_t vacc0x0123 = vcombine_s32(vsum0x01, vsum0x23 );
+    const int32x2_t vpsum0x4 = vadd_s32(vget_low_s32(vacc0x4), vget_high_s32(vacc0x4));
+    const int32x2_t vpsum0x5 = vadd_s32(vget_low_s32(vacc0x5), vget_high_s32(vacc0x5));
+    const int32x2_t vpsum0x6 = vadd_s32(vget_low_s32(vacc0x6), vget_high_s32(vacc0x6));
+    const int32x2_t vpsum0x7 = vadd_s32(vget_low_s32(vacc0x7), vget_high_s32(vacc0x7));
+    const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5);
+    const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7);
+    int32x4_t vacc0x4567 = vcombine_s32(vsum0x45, vsum0x67 );
+    const int32x2_t vpsum0x8 = vadd_s32(vget_low_s32(vacc0x8), vget_high_s32(vacc0x8));
+    const int32x2_t vpsum0x9 = vadd_s32(vget_low_s32(vacc0x9), vget_high_s32(vacc0x9));
+    const int32x2_t vpsum0xA = vadd_s32(vget_low_s32(vacc0x10), vget_high_s32(vacc0x10));
+    const int32x2_t vpsum0xB = vadd_s32(vget_low_s32(vacc0x11), vget_high_s32(vacc0x11));
+    const int32x2_t vsum0x89 = vpadd_s32(vpsum0x8, vpsum0x9);
+    const int32x2_t vsum0xAB = vpadd_s32(vpsum0xA, vpsum0xB);
+    int32x4_t vacc0x89AB = vcombine_s32(vsum0x89, vsum0xAB );
+    const int32x2_t vpsum0xC = vadd_s32(vget_low_s32(vacc0x12), vget_high_s32(vacc0x12));
+    const int32x2_t vpsum0xD = vadd_s32(vget_low_s32(vacc0x13), vget_high_s32(vacc0x13));
+    const int32x2_t vpsum0xE = vadd_s32(vget_low_s32(vacc0x14), vget_high_s32(vacc0x14));
+    const int32x2_t vpsum0xF = vadd_s32(vget_low_s32(vacc0x15), vget_high_s32(vacc0x15));
+    const int32x2_t vsum0xCD = vpadd_s32(vpsum0xC, vpsum0xD);
+    const int32x2_t vsum0xEF = vpadd_s32(vpsum0xE, vpsum0xF);
+    int32x4_t vacc0xCDEF = vcombine_s32(vsum0xCD, vsum0xEF );
+    const int32x2_t vpsum1x0 = vadd_s32(vget_low_s32(vacc1x0), vget_high_s32(vacc1x0));
+    const int32x2_t vpsum1x1 = vadd_s32(vget_low_s32(vacc1x1), vget_high_s32(vacc1x1));
+    const int32x2_t vpsum1x2 = vadd_s32(vget_low_s32(vacc1x2), vget_high_s32(vacc1x2));
+    const int32x2_t vpsum1x3 = vadd_s32(vget_low_s32(vacc1x3), vget_high_s32(vacc1x3));
+    const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1);
+    const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3);
+    int32x4_t vacc1x0123 = vcombine_s32(vsum1x01, vsum1x23 );
+    const int32x2_t vpsum1x4 = vadd_s32(vget_low_s32(vacc1x4), vget_high_s32(vacc1x4));
+    const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));
+    const int32x2_t vpsum1x6 = vadd_s32(vget_low_s32(vacc1x6), vget_high_s32(vacc1x6));
+    const int32x2_t vpsum1x7 = vadd_s32(vget_low_s32(vacc1x7), vget_high_s32(vacc1x7));
+    const int32x2_t vsum1x45 = vpadd_s32(vpsum1x4, vpsum1x5);
+    const int32x2_t vsum1x67 = vpadd_s32(vpsum1x6, vpsum1x7);
+    int32x4_t vacc1x4567 = vcombine_s32(vsum1x45, vsum1x67 );
+    const int32x2_t vpsum1x8 = vadd_s32(vget_low_s32(vacc1x8), vget_high_s32(vacc1x8));
+    const int32x2_t vpsum1x9 = vadd_s32(vget_low_s32(vacc1x9), vget_high_s32(vacc1x9));
+    const int32x2_t vpsum1xA = vadd_s32(vget_low_s32(vacc1x10), vget_high_s32(vacc1x10));
+    const int32x2_t vpsum1xB = vadd_s32(vget_low_s32(vacc1x11), vget_high_s32(vacc1x11));
+    const int32x2_t vsum1x89 = vpadd_s32(vpsum1x8, vpsum1x9);
+    const int32x2_t vsum1xAB = vpadd_s32(vpsum1xA, vpsum1xB);
+    int32x4_t vacc1x89AB = vcombine_s32(vsum1x89, vsum1xAB);
+    const int32x2_t vpsum1xC = vadd_s32(vget_low_s32(vacc1x12), vget_high_s32(vacc1x12));
+    const int32x2_t vpsum1xD = vadd_s32(vget_low_s32(vacc1x13), vget_high_s32(vacc1x13));
+    const int32x2_t vpsum1xE = vadd_s32(vget_low_s32(vacc1x14), vget_high_s32(vacc1x14));
+    const int32x2_t vpsum1xF = vadd_s32(vget_low_s32(vacc1x15), vget_high_s32(vacc1x15));
+    const int32x2_t vsum1xCD = vpadd_s32(vpsum1xC, vpsum1xD);
+    const int32x2_t vsum1xEF = vpadd_s32(vpsum1xE, vpsum1xF);
+    int32x4_t vacc1xCDEF = vcombine_s32(vsum1xCD, vsum1xEF);
+    const int32x2_t vpsum2x0 = vadd_s32(vget_low_s32(vacc2x0), vget_high_s32(vacc2x0));
+    const int32x2_t vpsum2x1 = vadd_s32(vget_low_s32(vacc2x1), vget_high_s32(vacc2x1));
+    const int32x2_t vpsum2x2 = vadd_s32(vget_low_s32(vacc2x2), vget_high_s32(vacc2x2));
+    const int32x2_t vpsum2x3 = vadd_s32(vget_low_s32(vacc2x3), vget_high_s32(vacc2x3));
+    const int32x2_t vsum2x01 = vpadd_s32(vpsum2x0, vpsum2x1);
+    const int32x2_t vsum2x23 = vpadd_s32(vpsum2x2, vpsum2x3);
+    int32x4_t vacc2x0123 = vcombine_s32(vsum2x01, vsum2x23);
+    const int32x2_t vpsum2x4 = vadd_s32(vget_low_s32(vacc2x4), vget_high_s32(vacc2x4));
+    const int32x2_t vpsum2x5 = vadd_s32(vget_low_s32(vacc2x5), vget_high_s32(vacc2x5));
+    const int32x2_t vpsum2x6 = vadd_s32(vget_low_s32(vacc2x6), vget_high_s32(vacc2x6));
+    const int32x2_t vpsum2x7 = vadd_s32(vget_low_s32(vacc2x7), vget_high_s32(vacc2x7));
+    const int32x2_t vsum2x45 = vpadd_s32(vpsum2x4, vpsum2x5);
+    const int32x2_t vsum2x67 = vpadd_s32(vpsum2x6, vpsum2x7);
+    int32x4_t vacc2x4567 = vcombine_s32(vsum2x45, vsum2x67);
+    const int32x2_t vpsum2x8 = vadd_s32(vget_low_s32(vacc2x8), vget_high_s32(vacc2x8));
+    const int32x2_t vpsum2x9 = vadd_s32(vget_low_s32(vacc2x9), vget_high_s32(vacc2x9));
+    const int32x2_t vpsum2xA = vadd_s32(vget_low_s32(vacc2x10), vget_high_s32(vacc2x10));
+    const int32x2_t vpsum2xB = vadd_s32(vget_low_s32(vacc2x11), vget_high_s32(vacc2x11));
+    const int32x2_t vsum2x89 = vpadd_s32(vpsum2x8, vpsum2x9);
+    const int32x2_t vsum2xAB = vpadd_s32(vpsum2xA, vpsum2xB);
+    int32x4_t vacc2x89AB = vcombine_s32(vsum2x89, vsum2xAB);
+    const int32x2_t vpsum2xC = vadd_s32(vget_low_s32(vacc2x12), vget_high_s32(vacc2x12));
+    const int32x2_t vpsum2xD = vadd_s32(vget_low_s32(vacc2x13), vget_high_s32(vacc2x13));
+    const int32x2_t vpsum2xE = vadd_s32(vget_low_s32(vacc2x14), vget_high_s32(vacc2x14));
+    const int32x2_t vpsum2xF = vadd_s32(vget_low_s32(vacc2x15), vget_high_s32(vacc2x15));
+    const int32x2_t vsum2xCD = vpadd_s32(vpsum2xC, vpsum2xD);
+    const int32x2_t vsum2xEF = vpadd_s32(vpsum2xE, vpsum2xF);
+    int32x4_t vacc2xCDEF = vcombine_s32(vsum2xCD, vsum2xEF);
+#endif
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+    vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc1x89AB = vqrdmulhq_s32(vacc1x89AB, vmultiplier);
+    vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier);
+    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+    vacc2x89AB = vqrdmulhq_s32(vacc2x89AB, vmultiplier);
+    vacc2xCDEF = vqrdmulhq_s32(vacc2xCDEF, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+    vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc1x89AB = vsraq_n_s32(vacc1x89AB, vbicq_s32(vacc1x89AB, vzero_shift_mask), 31);
+    vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31);
+    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+    vacc2x89AB = vsraq_n_s32(vacc2x89AB, vbicq_s32(vacc2x89AB, vzero_shift_mask), 31);
+    vacc2xCDEF = vsraq_n_s32(vacc2xCDEF, vbicq_s32(vacc2xCDEF, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc1x89AB = vrshlq_s32(vacc1x89AB, vright_shift);
+    vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_shift);
+    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+    vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_shift);
+    vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+    const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), voutput_zero_point);
+    int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+    int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
+    int8x16_t vout2x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc2x89ABCDEF);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+    const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+    int8x16_t vout1x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc1x01234567), vqmovn_s16(vacc1x89ABCDEF));
+    int8x16_t vout2x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc2x89ABCDEF));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout2x0123456789ABCDEF = vmaxq_s8(vout2x0123456789ABCDEF, voutput_min);
+    vout1x0123456789ABCDEF = vmaxq_s8(vout1x0123456789ABCDEF, voutput_min);
+    vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+
+    vout2x0123456789ABCDEF = vminq_s8(vout2x0123456789ABCDEF, voutput_max);
+    vout1x0123456789ABCDEF = vminq_s8(vout1x0123456789ABCDEF, voutput_max);
+    vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+
+    if (nc >= 16) {
+      vst1q_s8(c2 + 0, vout2x0123456789ABCDEF);
+      vst1q_s8(c1 + 0, vout1x0123456789ABCDEF);
+      vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 16;
+    } else {
+      int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8(vout1x0123456789ABCDEF));
+      int8x8_t vout2x01234567 = vget_low_s8(vout2x0123456789ABCDEF);
+      if (nc & 8) {
+        vst1_s8(c2, vout2x01234567); c2 += 8;
+        vst1_s8(c1, vget_high_s8(vout0x01234567_1x01234567)); c1 += 8;
+        vst1_s8(c0, vget_low_s8(vout0x01234567_1x01234567)); c0 += 8;
+        vout2x01234567 = vget_high_s8(vout2x0123456789ABCDEF);
+        vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0123456789ABCDEF));
+      }
+      if (nc & 4) {
+        vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1_lane_s8(c2, vout2x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-sse2-ld128.c b/src/qs8-igemm/gen/3x4c8-minmax-sse2-ld128.c
index dd940c0..47dba04 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-sse2-ld128.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-sse2-ld128.c
@@ -12,6 +12,7 @@
 #include <emmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_3x4c8__sse2_ld128(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-sse2-ld64.c b/src/qs8-igemm/gen/3x4c8-minmax-sse2-ld64.c
index 4904a1c..eaffe04 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-sse2-ld64.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-sse2-ld64.c
@@ -12,6 +12,7 @@
 #include <emmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_3x4c8__sse2_ld64(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-sse41-ld128.c b/src/qs8-igemm/gen/3x4c8-minmax-sse41-ld128.c
index 14d7f8c..b4c8d56 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-sse41-ld128.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-sse41-ld128.c
@@ -12,6 +12,7 @@
 #include <smmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-sse41-ld64.c b/src/qs8-igemm/gen/3x4c8-minmax-sse41-ld64.c
index 5450387..6092d91 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-sse41-ld64.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-sse41-ld64.c
@@ -12,6 +12,7 @@
 #include <smmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-ssse3-ld128.c b/src/qs8-igemm/gen/3x4c8-minmax-ssse3-ld128.c
index 3f3aefa..f428f76 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-ssse3-ld128.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-ssse3-ld128.c
@@ -12,6 +12,7 @@
 #include <tmmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_3x4c8__ssse3_ld128(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-ssse3-ld64.c b/src/qs8-igemm/gen/3x4c8-minmax-ssse3-ld64.c
index 8cca813..141be70 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-ssse3-ld64.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-ssse3-ld64.c
@@ -12,6 +12,7 @@
 #include <tmmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_3x4c8__ssse3_ld64(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-wasmsimd-ld128.c b/src/qs8-igemm/gen/3x4c8-minmax-wasmsimd-ld128.c
index 0c95932..f232658 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-wasmsimd-ld128.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-wasmsimd-ld128.c
@@ -12,6 +12,7 @@
 #include <wasm_simd128.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_3x4c8__wasmsimd_ld128(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-wasmsimd-ld64.c b/src/qs8-igemm/gen/3x4c8-minmax-wasmsimd-ld64.c
index 7f22e3d..1b9efcc 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-wasmsimd-ld64.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-wasmsimd-ld64.c
@@ -12,6 +12,7 @@
 #include <wasm_simd128.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_3x4c8__wasmsimd_ld64(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-xop-ld128.c b/src/qs8-igemm/gen/3x4c8-minmax-xop-ld128.c
index 1760fc0..4adc8c1 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-xop-ld128.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-xop-ld128.c
@@ -17,6 +17,7 @@
 #endif
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128(
@@ -44,6 +45,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-xop-ld64.c b/src/qs8-igemm/gen/3x4c8-minmax-xop-ld64.c
index c215aa9..6bb8aaa 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-xop-ld64.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-xop-ld64.c
@@ -17,6 +17,7 @@
 #endif
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64(
@@ -44,6 +45,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
diff --git a/src/qs8-igemm/gen/3x8-minmax-neon-mlal-lane.c b/src/qs8-igemm/gen/3x8-minmax-neon-mlal-lane.c
new file mode 100644
index 0000000..ac4a426
--- /dev/null
+++ b/src/qs8-igemm/gen/3x8-minmax-neon-mlal-lane.c
@@ -0,0 +1,340 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/neon-mlal-lane.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/igemm.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_3x8__neon_mlal_lane(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (3 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+
+  do {
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc1x0123 = vacc0x0123;
+    int32x4_t vacc1x4567 = vacc0x4567;
+    int32x4_t vacc2x0123 = vacc0x0123;
+    int32x4_t vacc2x4567 = vacc0x4567;
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      const int8_t* restrict a2 = a[2];
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+      }
+      a += 3;
+
+      size_t k = kc;
+      while (k >= 8 * sizeof(int8_t)) {
+        const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+        const int16x8_t vxa0 = vmovl_s8(va0);
+        const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+        const int16x8_t vxa1 = vmovl_s8(va1);
+        const int8x8_t va2 = vld1_s8(a2); a2 += 8;
+        const int16x8_t vxa2 = vmovl_s8(va2);
+
+        const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb01234567c0 = vmovl_s8(vb01234567c0);
+
+        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+        vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
+        vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
+        vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
+        vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
+        const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb01234567c1 = vmovl_s8(vb01234567c1);
+
+        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+        vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
+        vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
+        vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa2), 1);
+        vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa2), 1);
+        const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb01234567c2 = vmovl_s8(vb01234567c2);
+
+        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+        vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
+        vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
+        vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa2), 2);
+        vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa2), 2);
+        const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb01234567c3 = vmovl_s8(vb01234567c3);
+
+        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+        vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
+        vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
+        vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa2), 3);
+        vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa2), 3);
+
+        const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb01234567c4 = vmovl_s8(vb01234567c4);
+
+        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+        vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
+        vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
+        vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa2), 0);
+        vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa2), 0);
+        const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb01234567c5 = vmovl_s8(vb01234567c5);
+
+        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+        vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
+        vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
+        vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa2), 1);
+        vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa2), 1);
+        const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb01234567c6 = vmovl_s8(vb01234567c6);
+
+        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+        vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
+        vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
+        vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa2), 2);
+        vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa2), 2);
+        const int8x8_t vb01234567c7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb01234567c7 = vmovl_s8(vb01234567c7);
+
+        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa0), 3);
+        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa0), 3);
+        vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa1), 3);
+        vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa1), 3);
+        vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa2), 3);
+        vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa2), 3);
+
+        k -= 8 * sizeof(int8_t);
+      }
+      if XNN_UNLIKELY(k != 0) {
+        const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+        const int16x8_t vxa0 = vmovl_s8(va0);
+        const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
+        const int16x8_t vxa1 = vmovl_s8(va1);
+        const int8x8_t va2 = vld1_s8(a2); a2 = (const int8_t*) ((uintptr_t) a2 + k);
+        const int16x8_t vxa2 = vmovl_s8(va2);
+
+        const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb01234567c0 = vmovl_s8(vb01234567c0);
+
+        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+        vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
+        vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
+        vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
+        vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
+
+        if (k >= 2 * sizeof(int8_t)) {
+          const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int16x8_t vxb01234567c1 = vmovl_s8(vb01234567c1);
+
+          vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+          vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+          vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
+          vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
+          vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa2), 1);
+          vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa2), 1);
+
+          if (k > 2 * sizeof(int8_t)) {
+            const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int16x8_t vxb01234567c2 = vmovl_s8(vb01234567c2);
+
+            vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+            vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+            vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
+            vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
+            vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa2), 2);
+            vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa2), 2);
+
+            if (k >= 4 * sizeof(int8_t)) {
+              const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+              const int16x8_t vxb01234567c3 = vmovl_s8(vb01234567c3);
+
+              vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+              vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+              vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
+              vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
+              vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa2), 3);
+              vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa2), 3);
+
+              if (k > 4 * sizeof(int8_t)) {
+                const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+                const int16x8_t vxb01234567c4 = vmovl_s8(vb01234567c4);
+
+                vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+                vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+                vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
+                vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
+                vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa2), 0);
+                vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa2), 0);
+
+                if (k >= 6 * sizeof(int8_t)) {
+                  const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+                  const int16x8_t vxb01234567c5 = vmovl_s8(vb01234567c5);
+
+                  vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+                  vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+                  vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
+                  vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
+                  vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa2), 1);
+                  vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa2), 1);
+
+                  if (k > 6 * sizeof(int8_t)) {
+                    const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+                    const int16x8_t vxb01234567c6 = vmovl_s8(vb01234567c6);
+
+                    vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+                    vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+                    vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
+                    vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
+                    vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa2), 2);
+                    vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa2), 2);
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+      p -= 3 * sizeof(void*);
+    } while (p != 0);
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
+    int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
+    int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout2x01234567 = vmax_s8(vout2x01234567, vget_low_s8(voutput_min));
+    vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min);
+
+    vout2x01234567 = vmin_s8(vout2x01234567, vget_low_s8(voutput_max));
+    vout0x01234567_1x01234567 = vminq_s8(vout0x01234567_1x01234567, voutput_max);
+
+    if (nc >= 8) {
+      vst1_s8(c2 + 0, vout2x01234567);
+      vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567));
+      vst1_s8(c0 + 0, vget_low_s8(vout0x01234567_1x01234567));
+
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1_lane_s8(c2, vout2x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/3x8-minmax-neon-mull-addw-dup.c b/src/qs8-igemm/gen/3x8-minmax-neon-mull-addw-dup.c
new file mode 100644
index 0000000..5091523
--- /dev/null
+++ b/src/qs8-igemm/gen/3x8-minmax-neon-mull-addw-dup.c
@@ -0,0 +1,363 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/neon-mull-addw-dup.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/gemm.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_3x8__neon_mull_addw_dup(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (3 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+
+  do {
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc1x0123 = vacc0x0123;
+    int32x4_t vacc1x4567 = vacc0x4567;
+    int32x4_t vacc2x0123 = vacc0x0123;
+    int32x4_t vacc2x4567 = vacc0x4567;
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      const int8_t* restrict a2 = a[2];
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+      }
+      a += 3;
+
+      size_t k = kc;
+      while (k >= 8 * sizeof(int8_t)) {
+        const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+        const int8x8_t va2 = vld1_s8(a2); a2 += 8;
+
+        const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va0, 0));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c0));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c0));
+        const int16x8_t vprod1x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va1, 0));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c0));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c0));
+        const int16x8_t vprod2x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va2, 0));
+        vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c0));
+        vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c0));
+        const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va0, 1));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c1));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c1));
+        const int16x8_t vprod1x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va1, 1));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c1));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c1));
+        const int16x8_t vprod2x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va2, 1));
+        vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c1));
+        vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c1));
+        const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va0, 2));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c2));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c2));
+        const int16x8_t vprod1x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va1, 2));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c2));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c2));
+        const int16x8_t vprod2x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va2, 2));
+        vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c2));
+        vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c2));
+        const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));
+        const int16x8_t vprod1x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va1, 3));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c3));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c3));
+        const int16x8_t vprod2x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va2, 3));
+        vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c3));
+        vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c3));
+        const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va0, 4));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c4));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c4));
+        const int16x8_t vprod1x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va1, 4));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c4));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c4));
+        const int16x8_t vprod2x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va2, 4));
+        vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c4));
+        vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c4));
+        const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va0, 5));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c5));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c5));
+        const int16x8_t vprod1x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va1, 5));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c5));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c5));
+        const int16x8_t vprod2x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va2, 5));
+        vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c5));
+        vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c5));
+        const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va0, 6));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c6));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c6));
+        const int16x8_t vprod1x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va1, 6));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c6));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c6));
+        const int16x8_t vprod2x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va2, 6));
+        vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c6));
+        vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c6));
+        const int8x8_t vb01234567c7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c7 = vmull_s8(vb01234567c7, vdup_lane_s8(va0, 7));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c7));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c7));
+        const int16x8_t vprod1x01234567c7 = vmull_s8(vb01234567c7, vdup_lane_s8(va1, 7));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c7));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c7));
+        const int16x8_t vprod2x01234567c7 = vmull_s8(vb01234567c7, vdup_lane_s8(va2, 7));
+        vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c7));
+        vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c7));
+
+        k -= 8 * sizeof(int8_t);
+      }
+      if XNN_UNLIKELY(k != 0) {
+        const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+        const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
+        const int8x8_t va2 = vld1_s8(a2); a2 = (const int8_t*) ((uintptr_t) a2 + k);
+
+        const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va0, 0));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c0));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c0));
+        const int16x8_t vprod1x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va1, 0));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c0));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c0));
+        const int16x8_t vprod2x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va2, 0));
+        vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c0));
+        vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c0));
+
+        if (k >= 2 * sizeof(int8_t)) {
+          const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          const int16x8_t vprod0x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va0, 1));
+          vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c1));
+          vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c1));
+          const int16x8_t vprod1x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va1, 1));
+          vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c1));
+          vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c1));
+          const int16x8_t vprod2x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va2, 1));
+          vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c1));
+          vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c1));
+
+          if (k > 2 * sizeof(int8_t)) {
+            const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+            const int16x8_t vprod0x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va0, 2));
+            vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c2));
+            vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c2));
+            const int16x8_t vprod1x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va1, 2));
+            vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c2));
+            vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c2));
+            const int16x8_t vprod2x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va2, 2));
+            vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c2));
+            vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c2));
+
+            if (k >= 4 * sizeof(int8_t)) {
+              const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+              const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));
+              vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
+              vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));
+              const int16x8_t vprod1x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va1, 3));
+              vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c3));
+              vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c3));
+              const int16x8_t vprod2x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va2, 3));
+              vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c3));
+              vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c3));
+
+              if (k > 4 * sizeof(int8_t)) {
+                const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+                const int16x8_t vprod0x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va0, 4));
+                vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c4));
+                vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c4));
+                const int16x8_t vprod1x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va1, 4));
+                vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c4));
+                vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c4));
+                const int16x8_t vprod2x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va2, 4));
+                vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c4));
+                vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c4));
+
+                if (k >= 6 * sizeof(int8_t)) {
+                  const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+                  const int16x8_t vprod0x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va0, 5));
+                  vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c5));
+                  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c5));
+                  const int16x8_t vprod1x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va1, 5));
+                  vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c5));
+                  vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c5));
+                  const int16x8_t vprod2x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va2, 5));
+                  vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c5));
+                  vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c5));
+
+                  if (k > 6 * sizeof(int8_t)) {
+                    const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+                    const int16x8_t vprod0x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va0, 6));
+                    vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c6));
+                    vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c6));
+                    const int16x8_t vprod1x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va1, 6));
+                    vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c6));
+                    vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c6));
+                    const int16x8_t vprod2x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va2, 6));
+                    vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c6));
+                    vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c6));
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+      p -= 3 * sizeof(void*);
+    } while (p != 0);
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
+    int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
+    int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout2x01234567 = vmax_s8(vout2x01234567, vget_low_s8(voutput_min));
+    vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min);
+
+    vout2x01234567 = vmin_s8(vout2x01234567, vget_low_s8(voutput_max));
+    vout0x01234567_1x01234567 = vminq_s8(vout0x01234567_1x01234567, voutput_max);
+
+    if (nc >= 8) {
+      vst1_s8(c2 + 0, vout2x01234567);
+      vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567));
+      vst1_s8(c0 + 0, vget_low_s8(vout0x01234567_1x01234567));
+
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1_lane_s8(c2, vout2x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/3x8c16-minmax-neon-mlal-padal.c b/src/qs8-igemm/gen/3x8c16-minmax-neon-mlal-padal.c
new file mode 100644
index 0000000..626dfa4
--- /dev/null
+++ b/src/qs8-igemm/gen/3x8c16-minmax-neon-mlal-padal.c
@@ -0,0 +1,339 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/c16-neon-mlal-padal.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (3 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 16);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+
+  do {
+    int32x4_t vacc0x0 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x1 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x2 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x3 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x4 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x5 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x6 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x7 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc1x0 = vacc0x0;
+    int32x4_t vacc1x1 = vacc0x1;
+    int32x4_t vacc1x2 = vacc0x2;
+    int32x4_t vacc1x3 = vacc0x3;
+    int32x4_t vacc1x4 = vacc0x4;
+    int32x4_t vacc1x5 = vacc0x5;
+    int32x4_t vacc1x6 = vacc0x6;
+    int32x4_t vacc1x7 = vacc0x7;
+    int32x4_t vacc2x0 = vacc0x0;
+    int32x4_t vacc2x1 = vacc0x1;
+    int32x4_t vacc2x2 = vacc0x2;
+    int32x4_t vacc2x3 = vacc0x3;
+    int32x4_t vacc2x4 = vacc0x4;
+    int32x4_t vacc2x5 = vacc0x5;
+    int32x4_t vacc2x6 = vacc0x6;
+    int32x4_t vacc2x7 = vacc0x7;
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      const int8_t* restrict a2 = a[2];
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+      }
+      a += 3;
+
+      // KC loop: 16 bytes per iteration; kc is rounded up to 16, so up to 15 remainder bytes are covered by padding
+      size_t k = 0;
+      while (k < kc) {
+        const int8x16_t va0 = vld1q_s8(a0); a0 += 16;
+        const int8x16_t va1 = vld1q_s8(a1); a1 += 16;
+        const int8x16_t va2 = vld1q_s8(a2); a2 += 16;
+
+        const int8x16_t vb0 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb1 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb2 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb3 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb4 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb5 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb6 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb7 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+
+        int16x8_t vprod0x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va0));
+        int16x8_t vprod1x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va1));
+        int16x8_t vprod2x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va2));
+        vprod0x0 = vmlal_s8(vprod0x0, vget_high_s8(vb0), vget_high_s8(va0));
+        vprod1x0 = vmlal_s8(vprod1x0, vget_high_s8(vb0), vget_high_s8(va1));
+        vprod2x0 = vmlal_s8(vprod2x0, vget_high_s8(vb0), vget_high_s8(va2));
+        vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+        vacc1x0 = vpadalq_s16(vacc1x0, vprod1x0);
+        vacc2x0 = vpadalq_s16(vacc2x0, vprod2x0);
+        int16x8_t vprod0x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va0));
+        int16x8_t vprod1x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va1));
+        int16x8_t vprod2x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va2));
+        vprod0x1 = vmlal_s8(vprod0x1, vget_high_s8(vb1), vget_high_s8(va0));
+        vprod1x1 = vmlal_s8(vprod1x1, vget_high_s8(vb1), vget_high_s8(va1));
+        vprod2x1 = vmlal_s8(vprod2x1, vget_high_s8(vb1), vget_high_s8(va2));
+        vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+        vacc1x1 = vpadalq_s16(vacc1x1, vprod1x1);
+        vacc2x1 = vpadalq_s16(vacc2x1, vprod2x1);
+        int16x8_t vprod0x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va0));
+        int16x8_t vprod1x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va1));
+        int16x8_t vprod2x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va2));
+        vprod0x2 = vmlal_s8(vprod0x2, vget_high_s8(vb2), vget_high_s8(va0));
+        vprod1x2 = vmlal_s8(vprod1x2, vget_high_s8(vb2), vget_high_s8(va1));
+        vprod2x2 = vmlal_s8(vprod2x2, vget_high_s8(vb2), vget_high_s8(va2));
+        vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+        vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);
+        vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
+        int16x8_t vprod0x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va0));
+        int16x8_t vprod1x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va1));
+        int16x8_t vprod2x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va2));
+        vprod0x3 = vmlal_s8(vprod0x3, vget_high_s8(vb3), vget_high_s8(va0));
+        vprod1x3 = vmlal_s8(vprod1x3, vget_high_s8(vb3), vget_high_s8(va1));
+        vprod2x3 = vmlal_s8(vprod2x3, vget_high_s8(vb3), vget_high_s8(va2));
+        vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+        vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);
+        vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
+        int16x8_t vprod0x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va0));
+        int16x8_t vprod1x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va1));
+        int16x8_t vprod2x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va2));
+        vprod0x4 = vmlal_s8(vprod0x4, vget_high_s8(vb4), vget_high_s8(va0));
+        vprod1x4 = vmlal_s8(vprod1x4, vget_high_s8(vb4), vget_high_s8(va1));
+        vprod2x4 = vmlal_s8(vprod2x4, vget_high_s8(vb4), vget_high_s8(va2));
+        vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+        vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);
+        vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);
+        int16x8_t vprod0x5 = vmull_s8(vget_low_s8(vb5), vget_low_s8(va0));
+        int16x8_t vprod1x5 = vmull_s8(vget_low_s8(vb5), vget_low_s8(va1));
+        int16x8_t vprod2x5 = vmull_s8(vget_low_s8(vb5), vget_low_s8(va2));
+        vprod0x5 = vmlal_s8(vprod0x5, vget_high_s8(vb5), vget_high_s8(va0));
+        vprod1x5 = vmlal_s8(vprod1x5, vget_high_s8(vb5), vget_high_s8(va1));
+        vprod2x5 = vmlal_s8(vprod2x5, vget_high_s8(vb5), vget_high_s8(va2));
+        vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+        vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);
+        vacc2x5 = vpadalq_s16(vacc2x5, vprod2x5);
+        int16x8_t vprod0x6 = vmull_s8(vget_low_s8(vb6), vget_low_s8(va0));
+        int16x8_t vprod1x6 = vmull_s8(vget_low_s8(vb6), vget_low_s8(va1));
+        int16x8_t vprod2x6 = vmull_s8(vget_low_s8(vb6), vget_low_s8(va2));
+        vprod0x6 = vmlal_s8(vprod0x6, vget_high_s8(vb6), vget_high_s8(va0));
+        vprod1x6 = vmlal_s8(vprod1x6, vget_high_s8(vb6), vget_high_s8(va1));
+        vprod2x6 = vmlal_s8(vprod2x6, vget_high_s8(vb6), vget_high_s8(va2));
+        vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+        vacc1x6 = vpadalq_s16(vacc1x6, vprod1x6);
+        vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6);
+        int16x8_t vprod0x7 = vmull_s8(vget_low_s8(vb7), vget_low_s8(va0));
+        int16x8_t vprod1x7 = vmull_s8(vget_low_s8(vb7), vget_low_s8(va1));
+        int16x8_t vprod2x7 = vmull_s8(vget_low_s8(vb7), vget_low_s8(va2));
+        vprod0x7 = vmlal_s8(vprod0x7, vget_high_s8(vb7), vget_high_s8(va0));
+        vprod1x7 = vmlal_s8(vprod1x7, vget_high_s8(vb7), vget_high_s8(va1));
+        vprod2x7 = vmlal_s8(vprod2x7, vget_high_s8(vb7), vget_high_s8(va2));
+        vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+        vacc1x7 = vpadalq_s16(vacc1x7, vprod1x7);
+        vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7);
+
+        k += 16 * sizeof(int8_t);
+      }
+
+      p -= 3 * sizeof(void*);
+    } while (p != 0);
+
+#if XNN_ARCH_ARM64
+    const int32x4_t vsum0x01 = vpaddq_s32(vacc0x0, vacc0x1);
+    const int32x4_t vsum0x23 = vpaddq_s32(vacc0x2, vacc0x3);
+    const int32x4_t vsum0x45 = vpaddq_s32(vacc0x4, vacc0x5);
+    const int32x4_t vsum0x67 = vpaddq_s32(vacc0x6, vacc0x7);
+    const int32x4_t vsum1x01 = vpaddq_s32(vacc1x0, vacc1x1);
+    const int32x4_t vsum1x23 = vpaddq_s32(vacc1x2, vacc1x3);
+    const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);
+    const int32x4_t vsum1x67 = vpaddq_s32(vacc1x6, vacc1x7);
+    const int32x4_t vsum2x01 = vpaddq_s32(vacc2x0, vacc2x1);
+    const int32x4_t vsum2x23 = vpaddq_s32(vacc2x2, vacc2x3);
+    const int32x4_t vsum2x45 = vpaddq_s32(vacc2x4, vacc2x5);
+    const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7);
+    int32x4_t vacc0x0123 = vpaddq_s32(vsum0x01, vsum0x23);
+    int32x4_t vacc0x4567 = vpaddq_s32(vsum0x45, vsum0x67);
+    int32x4_t vacc1x0123 = vpaddq_s32(vsum1x01, vsum1x23);
+    int32x4_t vacc1x4567 = vpaddq_s32(vsum1x45, vsum1x67);
+    int32x4_t vacc2x0123 = vpaddq_s32(vsum2x01, vsum2x23);
+    int32x4_t vacc2x4567 = vpaddq_s32(vsum2x45, vsum2x67);
+#else
+    const int32x2_t vpsum0x0 = vadd_s32(vget_low_s32(vacc0x0), vget_high_s32(vacc0x0));
+    const int32x2_t vpsum0x1 = vadd_s32(vget_low_s32(vacc0x1), vget_high_s32(vacc0x1));
+    const int32x2_t vpsum0x2 = vadd_s32(vget_low_s32(vacc0x2), vget_high_s32(vacc0x2));
+    const int32x2_t vpsum0x3 = vadd_s32(vget_low_s32(vacc0x3), vget_high_s32(vacc0x3));
+    const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1);
+    const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3);
+    int32x4_t vacc0x0123 = vcombine_s32(vsum0x01, vsum0x23);
+    const int32x2_t vpsum0x4 = vadd_s32(vget_low_s32(vacc0x4), vget_high_s32(vacc0x4));
+    const int32x2_t vpsum0x5 = vadd_s32(vget_low_s32(vacc0x5), vget_high_s32(vacc0x5));
+    const int32x2_t vpsum0x6 = vadd_s32(vget_low_s32(vacc0x6), vget_high_s32(vacc0x6));
+    const int32x2_t vpsum0x7 = vadd_s32(vget_low_s32(vacc0x7), vget_high_s32(vacc0x7));
+    const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5);
+    const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7);
+    int32x4_t vacc0x4567 = vcombine_s32(vsum0x45, vsum0x67);
+    const int32x2_t vpsum1x0 = vadd_s32(vget_low_s32(vacc1x0), vget_high_s32(vacc1x0));
+    const int32x2_t vpsum1x1 = vadd_s32(vget_low_s32(vacc1x1), vget_high_s32(vacc1x1));
+    const int32x2_t vpsum1x2 = vadd_s32(vget_low_s32(vacc1x2), vget_high_s32(vacc1x2));
+    const int32x2_t vpsum1x3 = vadd_s32(vget_low_s32(vacc1x3), vget_high_s32(vacc1x3));
+    const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1);
+    const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3);
+    int32x4_t vacc1x0123 = vcombine_s32(vsum1x01, vsum1x23);
+    const int32x2_t vpsum1x4 = vadd_s32(vget_low_s32(vacc1x4), vget_high_s32(vacc1x4));
+    const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));
+    const int32x2_t vpsum1x6 = vadd_s32(vget_low_s32(vacc1x6), vget_high_s32(vacc1x6));
+    const int32x2_t vpsum1x7 = vadd_s32(vget_low_s32(vacc1x7), vget_high_s32(vacc1x7));
+    const int32x2_t vsum1x45 = vpadd_s32(vpsum1x4, vpsum1x5);
+    const int32x2_t vsum1x67 = vpadd_s32(vpsum1x6, vpsum1x7);
+    int32x4_t vacc1x4567 = vcombine_s32(vsum1x45, vsum1x67);
+    const int32x2_t vpsum2x0 = vadd_s32(vget_low_s32(vacc2x0), vget_high_s32(vacc2x0));
+    const int32x2_t vpsum2x1 = vadd_s32(vget_low_s32(vacc2x1), vget_high_s32(vacc2x1));
+    const int32x2_t vpsum2x2 = vadd_s32(vget_low_s32(vacc2x2), vget_high_s32(vacc2x2));
+    const int32x2_t vpsum2x3 = vadd_s32(vget_low_s32(vacc2x3), vget_high_s32(vacc2x3));
+    const int32x2_t vsum2x01 = vpadd_s32(vpsum2x0, vpsum2x1);
+    const int32x2_t vsum2x23 = vpadd_s32(vpsum2x2, vpsum2x3);
+    int32x4_t vacc2x0123 = vcombine_s32(vsum2x01, vsum2x23);
+    const int32x2_t vpsum2x4 = vadd_s32(vget_low_s32(vacc2x4), vget_high_s32(vacc2x4));
+    const int32x2_t vpsum2x5 = vadd_s32(vget_low_s32(vacc2x5), vget_high_s32(vacc2x5));
+    const int32x2_t vpsum2x6 = vadd_s32(vget_low_s32(vacc2x6), vget_high_s32(vacc2x6));
+    const int32x2_t vpsum2x7 = vadd_s32(vget_low_s32(vacc2x7), vget_high_s32(vacc2x7));
+    const int32x2_t vsum2x45 = vpadd_s32(vpsum2x4, vpsum2x5);
+    const int32x2_t vsum2x67 = vpadd_s32(vpsum2x6, vpsum2x7);
+    int32x4_t vacc2x4567 = vcombine_s32(vsum2x45, vsum2x67);
+#endif
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+    int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
+    int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
+    int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout2x01234567 = vmax_s8(vout2x01234567, vget_low_s8(voutput_min));
+    vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min);
+
+    vout2x01234567 = vmin_s8(vout2x01234567, vget_low_s8(voutput_max));
+    vout0x01234567_1x01234567 = vminq_s8(vout0x01234567_1x01234567, voutput_max);
+
+    if (nc >= 8) {
+      vst1_s8(c2 + 0, vout2x01234567);
+      vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567));
+      vst1_s8(c0 + 0, vget_low_s8(vout0x01234567_1x01234567));
+
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1_lane_s8(c2, vout2x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/3x8c2-minmax-neon-mlal-padal-dup.c b/src/qs8-igemm/gen/3x8c2-minmax-neon-mlal-padal-dup.c
new file mode 100644
index 0000000..bb5b54f
--- /dev/null
+++ b/src/qs8-igemm/gen/3x8c2-minmax-neon-mlal-padal-dup.c
@@ -0,0 +1,392 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/c2-neon-mull-padal-dup.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (3 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+
+  do {
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc1x0123 = vacc0x0123;
+    int32x4_t vacc1x4567 = vacc0x4567;
+    int32x4_t vacc2x0123 = vacc0x0123;
+    int32x4_t vacc2x4567 = vacc0x4567;
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      const int8_t* restrict a2 = a[2];
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+      }
+      a += 3;
+
+      size_t k = kc;
+
+      while (k >= 16 * sizeof(int8_t)) {
+        const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va0x1 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va1x0 = vld1_s8(a1); a1 += 8;
+        const int8x8_t va1x1 = vld1_s8(a1); a1 += 8;
+        const int8x8_t va2x0 = vld1_s8(a2); a2 += 8;
+        const int8x8_t va2x1 = vld1_s8(a2); a2 += 8;
+
+        const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+        int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0)));
+        int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
+        const int8x8_t vb0123c0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x0123c0 = vmlal_s8(vprod0x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+        vprod1x0123c0 = vmlal_s8(vprod1x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
+        vprod2x0123c0 = vmlal_s8(vprod2x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 0)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+        int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+        int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0)));
+        int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
+        const int8x8_t vb4567c0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x4567c0 = vmlal_s8(vprod0x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+        vprod1x4567c0 = vmlal_s8(vprod1x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
+        vprod2x4567c0 = vmlal_s8(vprod2x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 0)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+        int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+        int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 1)));
+        int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
+        const int8x8_t vb0123c1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x0123c1 = vmlal_s8(vprod0x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+        vprod1x0123c1 = vmlal_s8(vprod1x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
+        vprod2x0123c1 = vmlal_s8(vprod2x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 1)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+        int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+        int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 1)));
+        int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
+        const int8x8_t vb4567c1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x4567c1 = vmlal_s8(vprod0x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+        vprod1x4567c1 = vmlal_s8(vprod1x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
+        vprod2x4567c1 = vmlal_s8(vprod2x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 1)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+        int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+        int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 2)));
+        int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 2)));
+        const int8x8_t vb0123c2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x0123c2 = vmlal_s8(vprod0x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+        vprod1x0123c2 = vmlal_s8(vprod1x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
+        vprod2x0123c2 = vmlal_s8(vprod2x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 2)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+        int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+        int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 2)));
+        int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 2)));
+        const int8x8_t vb4567c2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x4567c2 = vmlal_s8(vprod0x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+        vprod1x4567c2 = vmlal_s8(vprod1x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
+        vprod2x4567c2 = vmlal_s8(vprod2x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 2)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+        int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+        int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 3)));
+        int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 3)));
+        const int8x8_t vb0123c3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x0123c3 = vmlal_s8(vprod0x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+        vprod1x0123c3 = vmlal_s8(vprod1x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
+        vprod2x0123c3 = vmlal_s8(vprod2x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 3)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
+        int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+        int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 3)));
+        int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 3)));
+        const int8x8_t vb4567c3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x4567c3 = vmlal_s8(vprod0x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+        vprod1x4567c3 = vmlal_s8(vprod1x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
+        vprod2x4567c3 = vmlal_s8(vprod2x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 3)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
+
+        k -= 16 * sizeof(int8_t);
+      }
+
+      if (k >= 8 * sizeof(int8_t)) {
+        const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+        const int8x8_t va2 = vld1_s8(a2); a2 += 8;
+
+        const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+        const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+        const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+        const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+        const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+        const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+        const int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+        const int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+        const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+        const int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
+        const int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+        const int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+        const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+        const int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
+
+        k -= 8 * sizeof(int8_t);
+      }
+
+      if XNN_UNLIKELY(k != 0) {
+        const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+        const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
+        const int8x8_t va2 = vld1_s8(a2); a2 = (const int8_t*) ((uintptr_t) a2 + k);
+
+        const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+        const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+        const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+        const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+        const int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+        const int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+
+        if (k > 2 * sizeof(int8_t)) {
+          const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+          const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+          const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+          vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+          const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+          vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+          const int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+          vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+          const int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+          vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+
+          if (k > 4 * sizeof(int8_t)) {
+            const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+            const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+            const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+            const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+            vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+            const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+            vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+            const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+            vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+            const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+            vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+          }
+        }
+      }
+      p -= 3 * sizeof(void*);
+    } while (p != 0);
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
+    int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
+    int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout2x01234567 = vmax_s8(vout2x01234567, vget_low_s8(voutput_min));
+    vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min);
+
+    vout2x01234567 = vmin_s8(vout2x01234567, vget_low_s8(voutput_max));
+    vout0x01234567_1x01234567 = vminq_s8(vout0x01234567_1x01234567, voutput_max);
+
+    if (nc >= 8) {
+      vst1_s8(c2 + 0, vout2x01234567);
+      vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567));
+      vst1_s8(c0 + 0, vget_low_s8(vout0x01234567_1x01234567));
+
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1_lane_s8(c2, vout2x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/3x8c2-minmax-neon-mull-padal-dup.c b/src/qs8-igemm/gen/3x8c2-minmax-neon-mull-padal-dup.c
new file mode 100644
index 0000000..0177c18
--- /dev/null
+++ b/src/qs8-igemm/gen/3x8c2-minmax-neon-mull-padal-dup.c
@@ -0,0 +1,292 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/c2-neon-mull-padal-dup.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (3 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+
+  do {
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc1x0123 = vacc0x0123;
+    int32x4_t vacc1x4567 = vacc0x4567;
+    int32x4_t vacc2x0123 = vacc0x0123;
+    int32x4_t vacc2x4567 = vacc0x4567;
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      const int8_t* restrict a2 = a[2];
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+      }
+      a += 3;
+
+      size_t k = kc;
+
+
+      while (k >= 8 * sizeof(int8_t)) {
+        const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+        const int8x8_t va2 = vld1_s8(a2); a2 += 8;
+
+        const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+        const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+        const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+        const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+        const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+        const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+        const int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+        const int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+        const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+        const int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
+        const int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+        const int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+        const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+        const int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
+
+        k -= 8 * sizeof(int8_t);
+      }
+
+      if XNN_UNLIKELY(k != 0) {
+        const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+        const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
+        const int8x8_t va2 = vld1_s8(a2); a2 = (const int8_t*) ((uintptr_t) a2 + k);
+
+        const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+        const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+        const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+        const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+        const int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+        const int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+
+        if (k > 2 * sizeof(int8_t)) {
+          const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+          const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+          const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+          vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+          const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+          vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+          const int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+          vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+          const int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+          vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+
+          if (k > 4 * sizeof(int8_t)) {
+            const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+            const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+            const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+            const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+            vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+            const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+            vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+            const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+            vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+            const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+            vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+          }
+        }
+      }
+      p -= 3 * sizeof(void*);
+    } while (p != 0);
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
+    int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
+    int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout2x01234567 = vmax_s8(vout2x01234567, vget_low_s8(voutput_min));
+    vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min);
+
+    vout2x01234567 = vmin_s8(vout2x01234567, vget_low_s8(voutput_max));
+    vout0x01234567_1x01234567 = vminq_s8(vout0x01234567_1x01234567, voutput_max);
+
+    if (nc >= 8) {
+      vst1_s8(c2 + 0, vout2x01234567);
+      vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567));
+      vst1_s8(c0 + 0, vget_low_s8(vout0x01234567_1x01234567));
+
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1_lane_s8(c2, vout2x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
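Reviewer note: every new QS8 kernel in this change ends with the same NEON requantization tail as the one above (fixed-point multiply, rounding right shift, zero-point add, saturating narrow, clamp). The standalone helper below is a minimal sketch of that sequence for reference; the function name, its parameter list, and the convention of passing a positive shift count are illustrative and not taken from the XNNPACK sources.

#include <arm_neon.h>
#include <stdint.h>

// Requantize eight int32 accumulators (two int32x4 halves) to clamped int8.
static inline int8x8_t requantize_sketch(
    int32x4_t vacc_lo, int32x4_t vacc_hi,
    int32_t multiplier, int32_t shift,
    int16_t output_zero_point, int8_t output_min, int8_t output_max)
{
  // Q31 fixed-point multiply with rounding (doubling high half).
  const int32x4_t vmultiplier = vdupq_n_s32(multiplier);
  vacc_lo = vqrdmulhq_s32(vacc_lo, vmultiplier);
  vacc_hi = vqrdmulhq_s32(vacc_hi, vmultiplier);

  // vrshlq_s32 with a negative count is a rounding right shift that rounds
  // ties towards +infinity. The vbic/vsra step subtracts 1 from negative
  // accumulators (only when the shift is non-zero) so ties round away from
  // zero instead, matching the kernels above.
  const int32x4_t vright_shift = vdupq_n_s32(-shift);
  const int32x4_t vzero_shift_mask =
      vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
  vacc_lo = vsraq_n_s32(vacc_lo, vbicq_s32(vacc_lo, vzero_shift_mask), 31);
  vacc_hi = vsraq_n_s32(vacc_hi, vbicq_s32(vacc_hi, vzero_shift_mask), 31);
  vacc_lo = vrshlq_s32(vacc_lo, vright_shift);
  vacc_hi = vrshlq_s32(vacc_hi, vright_shift);

  // Saturating narrow to int16, add the output zero point, then saturating
  // narrow to int8 and clamp to the requested output range.
  const int16x8_t vzero_point = vdupq_n_s16(output_zero_point);
  const int16x8_t vacc16 = vqaddq_s16(
      vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)), vzero_point);

  int8x8_t vout = vqmovn_s16(vacc16);
  vout = vmax_s8(vout, vdup_n_s8(output_min));
  vout = vmin_s8(vout, vdup_n_s8(output_max));
  return vout;
}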
diff --git a/src/qs8-igemm/gen/3x8c8-minmax-avx2.c b/src/qs8-igemm/gen/3x8c8-minmax-avx2.c
index 4269041..231c884 100644
--- a/src/qs8-igemm/gen/3x8c8-minmax-avx2.c
+++ b/src/qs8-igemm/gen/3x8c8-minmax-avx2.c
@@ -13,6 +13,7 @@
 
 #include <xnnpack/igemm.h>
 #include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2(
@@ -40,6 +41,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
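Reviewer note: the `kc = round_up_po2(kc, 8);` line added in this hunk (and present in the new kernels below) rounds the reduction dimension up to a multiple of 8 so the 8-bytes-per-step inner loops never process a partial group. A minimal sketch of what such a power-of-two round-up computes, assuming the usual bit-trick formulation (the real helper is declared in <xnnpack/math.h> and may differ in detail):

#include <assert.h>
#include <stddef.h>

// Round n up to the nearest multiple of q, where q is a power of two.
static inline size_t round_up_po2_sketch(size_t n, size_t q) {
  assert(q != 0 && (q & (q - 1)) == 0);  // q must be a power of two
  return (n + q - 1) & ~(q - 1);
}

// Example: round_up_po2_sketch(13, 8) == 16, round_up_po2_sketch(16, 8) == 16.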
diff --git a/src/qs8-igemm/gen/3x8c8-minmax-neon-mlal-padal.c b/src/qs8-igemm/gen/3x8c8-minmax-neon-mlal-padal.c
new file mode 100644
index 0000000..f5db513
--- /dev/null
+++ b/src/qs8-igemm/gen/3x8c8-minmax-neon-mlal-padal.c
@@ -0,0 +1,416 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/c8-neon-mull-padal.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (3 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+
+  do {
+    int32x4_t vacc0x0 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x1 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x2 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x3 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x4 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x5 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x6 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x7 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc1x0 = vacc0x0;
+    int32x4_t vacc1x1 = vacc0x1;
+    int32x4_t vacc1x2 = vacc0x2;
+    int32x4_t vacc1x3 = vacc0x3;
+    int32x4_t vacc1x4 = vacc0x4;
+    int32x4_t vacc1x5 = vacc0x5;
+    int32x4_t vacc1x6 = vacc0x6;
+    int32x4_t vacc1x7 = vacc0x7;
+    int32x4_t vacc2x0 = vacc0x0;
+    int32x4_t vacc2x1 = vacc0x1;
+    int32x4_t vacc2x2 = vacc0x2;
+    int32x4_t vacc2x3 = vacc0x3;
+    int32x4_t vacc2x4 = vacc0x4;
+    int32x4_t vacc2x5 = vacc0x5;
+    int32x4_t vacc2x6 = vacc0x6;
+    int32x4_t vacc2x7 = vacc0x7;
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      const int8_t* restrict a2 = a[2];
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+      }
+      a += 3;
+
+      size_t k = kc;
+      // 2x partial unrolled loop to load 16 bytes at a time using MLA.
+      while (k >= 16 * sizeof(int8_t)) {
+        const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va0x1 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va1x0 = vld1_s8(a1); a1 += 8;
+        const int8x8_t va1x1 = vld1_s8(a1); a1 += 8;
+        const int8x8_t va2x0 = vld1_s8(a2); a2 += 8;
+        const int8x8_t va2x1 = vld1_s8(a2); a2 += 8;
+
+        const int8x8_t vb0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb4x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb5x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb6x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb7x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+
+        const int8x8_t vb0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x0 = vmull_s8(vb0x0, va0x0);
+        int16x8_t vprod1x0 = vmull_s8(vb0x0, va1x0);
+        int16x8_t vprod2x0 = vmull_s8(vb0x0, va2x0);
+        vprod0x0 = vmlal_s8(vprod0x0, vb0x1, va0x1);
+        vprod1x0 = vmlal_s8(vprod1x0, vb0x1, va1x1);
+        vprod2x0 = vmlal_s8(vprod2x0, vb0x1, va2x1);
+        vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+        vacc1x0 = vpadalq_s16(vacc1x0, vprod1x0);
+        vacc2x0 = vpadalq_s16(vacc2x0, vprod2x0);
+        const int8x8_t vb1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x1 = vmull_s8(vb1x0, va0x0);
+        int16x8_t vprod1x1 = vmull_s8(vb1x0, va1x0);
+        int16x8_t vprod2x1 = vmull_s8(vb1x0, va2x0);
+        vprod0x1 = vmlal_s8(vprod0x1, vb1x1, va0x1);
+        vprod1x1 = vmlal_s8(vprod1x1, vb1x1, va1x1);
+        vprod2x1 = vmlal_s8(vprod2x1, vb1x1, va2x1);
+        vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+        vacc1x1 = vpadalq_s16(vacc1x1, vprod1x1);
+        vacc2x1 = vpadalq_s16(vacc2x1, vprod2x1);
+        const int8x8_t vb2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x2 = vmull_s8(vb2x0, va0x0);
+        int16x8_t vprod1x2 = vmull_s8(vb2x0, va1x0);
+        int16x8_t vprod2x2 = vmull_s8(vb2x0, va2x0);
+        vprod0x2 = vmlal_s8(vprod0x2, vb2x1, va0x1);
+        vprod1x2 = vmlal_s8(vprod1x2, vb2x1, va1x1);
+        vprod2x2 = vmlal_s8(vprod2x2, vb2x1, va2x1);
+        vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+        vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);
+        vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
+        const int8x8_t vb3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x3 = vmull_s8(vb3x0, va0x0);
+        int16x8_t vprod1x3 = vmull_s8(vb3x0, va1x0);
+        int16x8_t vprod2x3 = vmull_s8(vb3x0, va2x0);
+        vprod0x3 = vmlal_s8(vprod0x3, vb3x1, va0x1);
+        vprod1x3 = vmlal_s8(vprod1x3, vb3x1, va1x1);
+        vprod2x3 = vmlal_s8(vprod2x3, vb3x1, va2x1);
+        vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+        vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);
+        vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
+        const int8x8_t vb4x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x4 = vmull_s8(vb4x0, va0x0);
+        int16x8_t vprod1x4 = vmull_s8(vb4x0, va1x0);
+        int16x8_t vprod2x4 = vmull_s8(vb4x0, va2x0);
+        vprod0x4 = vmlal_s8(vprod0x4, vb4x1, va0x1);
+        vprod1x4 = vmlal_s8(vprod1x4, vb4x1, va1x1);
+        vprod2x4 = vmlal_s8(vprod2x4, vb4x1, va2x1);
+        vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+        vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);
+        vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);
+        const int8x8_t vb5x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x5 = vmull_s8(vb5x0, va0x0);
+        int16x8_t vprod1x5 = vmull_s8(vb5x0, va1x0);
+        int16x8_t vprod2x5 = vmull_s8(vb5x0, va2x0);
+        vprod0x5 = vmlal_s8(vprod0x5, vb5x1, va0x1);
+        vprod1x5 = vmlal_s8(vprod1x5, vb5x1, va1x1);
+        vprod2x5 = vmlal_s8(vprod2x5, vb5x1, va2x1);
+        vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+        vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);
+        vacc2x5 = vpadalq_s16(vacc2x5, vprod2x5);
+        const int8x8_t vb6x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x6 = vmull_s8(vb6x0, va0x0);
+        int16x8_t vprod1x6 = vmull_s8(vb6x0, va1x0);
+        int16x8_t vprod2x6 = vmull_s8(vb6x0, va2x0);
+        vprod0x6 = vmlal_s8(vprod0x6, vb6x1, va0x1);
+        vprod1x6 = vmlal_s8(vprod1x6, vb6x1, va1x1);
+        vprod2x6 = vmlal_s8(vprod2x6, vb6x1, va2x1);
+        vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+        vacc1x6 = vpadalq_s16(vacc1x6, vprod1x6);
+        vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6);
+        const int8x8_t vb7x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x7 = vmull_s8(vb7x0, va0x0);
+        int16x8_t vprod1x7 = vmull_s8(vb7x0, va1x0);
+        int16x8_t vprod2x7 = vmull_s8(vb7x0, va2x0);
+        vprod0x7 = vmlal_s8(vprod0x7, vb7x1, va0x1);
+        vprod1x7 = vmlal_s8(vprod1x7, vb7x1, va1x1);
+        vprod2x7 = vmlal_s8(vprod2x7, vb7x1, va2x1);
+        vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+        vacc1x7 = vpadalq_s16(vacc1x7, vprod1x7);
+        vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7);
+
+        k -= 16 * sizeof(int8_t);
+      }
+
+      // Handle 8 bytes at a time using MUL.
+      if (k > 0) {
+        const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+        const int8x8_t va2 = vld1_s8(a2); a2 += 8;
+
+        const int8x8_t vb0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x0 = vmull_s8(vb0, va0);
+        const int16x8_t vprod1x0 = vmull_s8(vb0, va1);
+        const int16x8_t vprod2x0 = vmull_s8(vb0, va2);
+        vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+        vacc1x0 = vpadalq_s16(vacc1x0, vprod1x0);
+        vacc2x0 = vpadalq_s16(vacc2x0, vprod2x0);
+        const int8x8_t vb1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x1 = vmull_s8(vb1, va0);
+        const int16x8_t vprod1x1 = vmull_s8(vb1, va1);
+        const int16x8_t vprod2x1 = vmull_s8(vb1, va2);
+        vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+        vacc1x1 = vpadalq_s16(vacc1x1, vprod1x1);
+        vacc2x1 = vpadalq_s16(vacc2x1, vprod2x1);
+        const int8x8_t vb2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x2 = vmull_s8(vb2, va0);
+        const int16x8_t vprod1x2 = vmull_s8(vb2, va1);
+        const int16x8_t vprod2x2 = vmull_s8(vb2, va2);
+        vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+        vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);
+        vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
+        const int8x8_t vb3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x3 = vmull_s8(vb3, va0);
+        const int16x8_t vprod1x3 = vmull_s8(vb3, va1);
+        const int16x8_t vprod2x3 = vmull_s8(vb3, va2);
+        vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+        vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);
+        vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
+        const int8x8_t vb4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x4 = vmull_s8(vb4, va0);
+        const int16x8_t vprod1x4 = vmull_s8(vb4, va1);
+        const int16x8_t vprod2x4 = vmull_s8(vb4, va2);
+        vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+        vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);
+        vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);
+        const int8x8_t vb5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x5 = vmull_s8(vb5, va0);
+        const int16x8_t vprod1x5 = vmull_s8(vb5, va1);
+        const int16x8_t vprod2x5 = vmull_s8(vb5, va2);
+        vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+        vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);
+        vacc2x5 = vpadalq_s16(vacc2x5, vprod2x5);
+        const int8x8_t vb6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x6 = vmull_s8(vb6, va0);
+        const int16x8_t vprod1x6 = vmull_s8(vb6, va1);
+        const int16x8_t vprod2x6 = vmull_s8(vb6, va2);
+        vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+        vacc1x6 = vpadalq_s16(vacc1x6, vprod1x6);
+        vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6);
+        const int8x8_t vb7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x7 = vmull_s8(vb7, va0);
+        const int16x8_t vprod1x7 = vmull_s8(vb7, va1);
+        const int16x8_t vprod2x7 = vmull_s8(vb7, va2);
+        vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+        vacc1x7 = vpadalq_s16(vacc1x7, vprod1x7);
+        vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7);
+
+        k -= 8 * sizeof(int8_t);
+      }
+
+      p -= 3 * sizeof(void*);
+    } while (p != 0);
+
+#if XNN_ARCH_ARM64
+    const int32x4_t vsum0x01 = vpaddq_s32(vacc0x0, vacc0x1);
+    const int32x4_t vsum0x23 = vpaddq_s32(vacc0x2, vacc0x3);
+    const int32x4_t vsum0x45 = vpaddq_s32(vacc0x4, vacc0x5);
+    const int32x4_t vsum0x67 = vpaddq_s32(vacc0x6, vacc0x7);
+    const int32x4_t vsum1x01 = vpaddq_s32(vacc1x0, vacc1x1);
+    const int32x4_t vsum1x23 = vpaddq_s32(vacc1x2, vacc1x3);
+    const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);
+    const int32x4_t vsum1x67 = vpaddq_s32(vacc1x6, vacc1x7);
+    const int32x4_t vsum2x01 = vpaddq_s32(vacc2x0, vacc2x1);
+    const int32x4_t vsum2x23 = vpaddq_s32(vacc2x2, vacc2x3);
+    const int32x4_t vsum2x45 = vpaddq_s32(vacc2x4, vacc2x5);
+    const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7);
+    int32x4_t vacc0x0123 = vpaddq_s32(vsum0x01, vsum0x23);
+    int32x4_t vacc0x4567 = vpaddq_s32(vsum0x45, vsum0x67);
+    int32x4_t vacc1x0123 = vpaddq_s32(vsum1x01, vsum1x23);
+    int32x4_t vacc1x4567 = vpaddq_s32(vsum1x45, vsum1x67);
+    int32x4_t vacc2x0123 = vpaddq_s32(vsum2x01, vsum2x23);
+    int32x4_t vacc2x4567 = vpaddq_s32(vsum2x45, vsum2x67);
+#else
+    const int32x2_t vpsum0x0 = vadd_s32(vget_low_s32(vacc0x0), vget_high_s32(vacc0x0));
+    const int32x2_t vpsum0x1 = vadd_s32(vget_low_s32(vacc0x1), vget_high_s32(vacc0x1));
+    const int32x2_t vpsum0x2 = vadd_s32(vget_low_s32(vacc0x2), vget_high_s32(vacc0x2));
+    const int32x2_t vpsum0x3 = vadd_s32(vget_low_s32(vacc0x3), vget_high_s32(vacc0x3));
+    const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1);
+    const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3);
+    int32x4_t vacc0x0123 = vcombine_s32(vsum0x01, vsum0x23 );
+    const int32x2_t vpsum0x4 = vadd_s32(vget_low_s32(vacc0x4), vget_high_s32(vacc0x4));
+    const int32x2_t vpsum0x5 = vadd_s32(vget_low_s32(vacc0x5), vget_high_s32(vacc0x5));
+    const int32x2_t vpsum0x6 = vadd_s32(vget_low_s32(vacc0x6), vget_high_s32(vacc0x6));
+    const int32x2_t vpsum0x7 = vadd_s32(vget_low_s32(vacc0x7), vget_high_s32(vacc0x7));
+    const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5);
+    const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7);
+    int32x4_t vacc0x4567 = vcombine_s32(vsum0x45, vsum0x67 );
+    const int32x2_t vpsum1x0 = vadd_s32(vget_low_s32(vacc1x0), vget_high_s32(vacc1x0));
+    const int32x2_t vpsum1x1 = vadd_s32(vget_low_s32(vacc1x1), vget_high_s32(vacc1x1));
+    const int32x2_t vpsum1x2 = vadd_s32(vget_low_s32(vacc1x2), vget_high_s32(vacc1x2));
+    const int32x2_t vpsum1x3 = vadd_s32(vget_low_s32(vacc1x3), vget_high_s32(vacc1x3));
+    const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1);
+    const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3);
+    int32x4_t vacc1x0123 = vcombine_s32(vsum1x01, vsum1x23 );
+    const int32x2_t vpsum1x4 = vadd_s32(vget_low_s32(vacc1x4), vget_high_s32(vacc1x4));
+    const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));
+    const int32x2_t vpsum1x6 = vadd_s32(vget_low_s32(vacc1x6), vget_high_s32(vacc1x6));
+    const int32x2_t vpsum1x7 = vadd_s32(vget_low_s32(vacc1x7), vget_high_s32(vacc1x7));
+    const int32x2_t vsum1x45 = vpadd_s32(vpsum1x4, vpsum1x5);
+    const int32x2_t vsum1x67 = vpadd_s32(vpsum1x6, vpsum1x7);
+    int32x4_t vacc1x4567 = vcombine_s32(vsum1x45, vsum1x67 );
+    const int32x2_t vpsum2x0 = vadd_s32(vget_low_s32(vacc2x0), vget_high_s32(vacc2x0));
+    const int32x2_t vpsum2x1 = vadd_s32(vget_low_s32(vacc2x1), vget_high_s32(vacc2x1));
+    const int32x2_t vpsum2x2 = vadd_s32(vget_low_s32(vacc2x2), vget_high_s32(vacc2x2));
+    const int32x2_t vpsum2x3 = vadd_s32(vget_low_s32(vacc2x3), vget_high_s32(vacc2x3));
+    const int32x2_t vsum2x01 = vpadd_s32(vpsum2x0, vpsum2x1);
+    const int32x2_t vsum2x23 = vpadd_s32(vpsum2x2, vpsum2x3);
+    int32x4_t vacc2x0123 = vcombine_s32(vsum2x01, vsum2x23 );
+    const int32x2_t vpsum2x4 = vadd_s32(vget_low_s32(vacc2x4), vget_high_s32(vacc2x4));
+    const int32x2_t vpsum2x5 = vadd_s32(vget_low_s32(vacc2x5), vget_high_s32(vacc2x5));
+    const int32x2_t vpsum2x6 = vadd_s32(vget_low_s32(vacc2x6), vget_high_s32(vacc2x6));
+    const int32x2_t vpsum2x7 = vadd_s32(vget_low_s32(vacc2x7), vget_high_s32(vacc2x7));
+    const int32x2_t vsum2x45 = vpadd_s32(vpsum2x4, vpsum2x5);
+    const int32x2_t vsum2x67 = vpadd_s32(vpsum2x6, vpsum2x7);
+    int32x4_t vacc2x4567 = vcombine_s32(vsum2x45, vsum2x67 );
+#endif
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+    int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
+    int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
+    int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout2x01234567 = vmax_s8(vout2x01234567, vget_low_s8(voutput_min));
+    vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min);
+
+    vout2x01234567 = vmin_s8(vout2x01234567, vget_low_s8(voutput_max));
+    vout0x01234567_1x01234567 = vminq_s8(vout0x01234567_1x01234567, voutput_max);
+
+    if (nc >= 8) {
+      vst1_s8(c2 + 0, vout2x01234567);
+      vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567));
+      vst1_s8(c0 + 0, vget_low_s8(vout0x01234567_1x01234567));
+
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1_lane_s8(c2, vout2x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
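Reviewer note: the 16-byte main loop in the mlal-padal kernel above widens with VMULL on the first 8 bytes of K, folds the second 8 bytes in with VMLAL while the products are still int16, and only then pays for one VPADAL per accumulator, halving the pairwise-add-accumulate work relative to the pure mull-padal variant. The sketch below shows the same pattern on a plain dot product; the function name and the scalar reduction at the end are illustrative only.

#include <arm_neon.h>
#include <stddef.h>
#include <stdint.h>

// Dot product of two length-k int8 vectors, k a multiple of 16, using the
// MULL + MLAL + PADAL pattern from the 16-byte main loop above.
static int32_t dot_s8_mlal_padal(const int8_t* a, const int8_t* b, size_t k) {
  int32x4_t vacc = vdupq_n_s32(0);
  while (k >= 16) {
    const int8x8_t va0 = vld1_s8(a);      const int8x8_t vb0 = vld1_s8(b);
    const int8x8_t va1 = vld1_s8(a + 8);  const int8x8_t vb1 = vld1_s8(b + 8);
    int16x8_t vprod = vmull_s8(va0, vb0);  // 8 widening int8*int8 products
    vprod = vmlal_s8(vprod, va1, vb1);     // accumulate 8 more while in int16
    // Caution: two products can reach 2 * 16384 = 32768 if both inputs are
    // -128, which would wrap in int16; the caller must rule that out
    // (e.g. by keeping one operand within [-127, 127]).
    vacc = vpadalq_s16(vacc, vprod);       // fold int16 pairs into int32 lanes
    a += 16; b += 16; k -= 16;
  }
  // Horizontal reduction of the four int32 lanes to a scalar result.
  const int32x2_t vsum = vadd_s32(vget_low_s32(vacc), vget_high_s32(vacc));
  return vget_lane_s32(vpadd_s32(vsum, vsum), 0);
}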
diff --git a/src/qs8-igemm/gen/3x8c8-minmax-neon-mull-padal.c b/src/qs8-igemm/gen/3x8c8-minmax-neon-mull-padal.c
new file mode 100644
index 0000000..c6e2fb7
--- /dev/null
+++ b/src/qs8-igemm/gen/3x8c8-minmax-neon-mull-padal.c
@@ -0,0 +1,315 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/c8-neon-mull-padal.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (3 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+
+  do {
+    int32x4_t vacc0x0 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x1 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x2 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x3 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x4 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x5 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x6 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x7 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc1x0 = vacc0x0;
+    int32x4_t vacc1x1 = vacc0x1;
+    int32x4_t vacc1x2 = vacc0x2;
+    int32x4_t vacc1x3 = vacc0x3;
+    int32x4_t vacc1x4 = vacc0x4;
+    int32x4_t vacc1x5 = vacc0x5;
+    int32x4_t vacc1x6 = vacc0x6;
+    int32x4_t vacc1x7 = vacc0x7;
+    int32x4_t vacc2x0 = vacc0x0;
+    int32x4_t vacc2x1 = vacc0x1;
+    int32x4_t vacc2x2 = vacc0x2;
+    int32x4_t vacc2x3 = vacc0x3;
+    int32x4_t vacc2x4 = vacc0x4;
+    int32x4_t vacc2x5 = vacc0x5;
+    int32x4_t vacc2x6 = vacc0x6;
+    int32x4_t vacc2x7 = vacc0x7;
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      const int8_t* restrict a2 = a[2];
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+      }
+      a += 3;
+
+      size_t k = kc;
+
+      // Handle 8 bytes at a time using MUL.
+      while (k > 0) {
+        const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+        const int8x8_t va2 = vld1_s8(a2); a2 += 8;
+
+        const int8x8_t vb0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x0 = vmull_s8(vb0, va0);
+        const int16x8_t vprod1x0 = vmull_s8(vb0, va1);
+        const int16x8_t vprod2x0 = vmull_s8(vb0, va2);
+        vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+        vacc1x0 = vpadalq_s16(vacc1x0, vprod1x0);
+        vacc2x0 = vpadalq_s16(vacc2x0, vprod2x0);
+        const int8x8_t vb1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x1 = vmull_s8(vb1, va0);
+        const int16x8_t vprod1x1 = vmull_s8(vb1, va1);
+        const int16x8_t vprod2x1 = vmull_s8(vb1, va2);
+        vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+        vacc1x1 = vpadalq_s16(vacc1x1, vprod1x1);
+        vacc2x1 = vpadalq_s16(vacc2x1, vprod2x1);
+        const int8x8_t vb2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x2 = vmull_s8(vb2, va0);
+        const int16x8_t vprod1x2 = vmull_s8(vb2, va1);
+        const int16x8_t vprod2x2 = vmull_s8(vb2, va2);
+        vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+        vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);
+        vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
+        const int8x8_t vb3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x3 = vmull_s8(vb3, va0);
+        const int16x8_t vprod1x3 = vmull_s8(vb3, va1);
+        const int16x8_t vprod2x3 = vmull_s8(vb3, va2);
+        vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+        vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);
+        vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
+        const int8x8_t vb4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x4 = vmull_s8(vb4, va0);
+        const int16x8_t vprod1x4 = vmull_s8(vb4, va1);
+        const int16x8_t vprod2x4 = vmull_s8(vb4, va2);
+        vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+        vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);
+        vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);
+        const int8x8_t vb5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x5 = vmull_s8(vb5, va0);
+        const int16x8_t vprod1x5 = vmull_s8(vb5, va1);
+        const int16x8_t vprod2x5 = vmull_s8(vb5, va2);
+        vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+        vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);
+        vacc2x5 = vpadalq_s16(vacc2x5, vprod2x5);
+        const int8x8_t vb6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x6 = vmull_s8(vb6, va0);
+        const int16x8_t vprod1x6 = vmull_s8(vb6, va1);
+        const int16x8_t vprod2x6 = vmull_s8(vb6, va2);
+        vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+        vacc1x6 = vpadalq_s16(vacc1x6, vprod1x6);
+        vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6);
+        const int8x8_t vb7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x7 = vmull_s8(vb7, va0);
+        const int16x8_t vprod1x7 = vmull_s8(vb7, va1);
+        const int16x8_t vprod2x7 = vmull_s8(vb7, va2);
+        vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+        vacc1x7 = vpadalq_s16(vacc1x7, vprod1x7);
+        vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7);
+
+        k -= 8 * sizeof(int8_t);
+      }
+
+      p -= 3 * sizeof(void*);
+    } while (p != 0);
+
+#if XNN_ARCH_ARM64
+    const int32x4_t vsum0x01 = vpaddq_s32(vacc0x0, vacc0x1);
+    const int32x4_t vsum0x23 = vpaddq_s32(vacc0x2, vacc0x3);
+    const int32x4_t vsum0x45 = vpaddq_s32(vacc0x4, vacc0x5);
+    const int32x4_t vsum0x67 = vpaddq_s32(vacc0x6, vacc0x7);
+    const int32x4_t vsum1x01 = vpaddq_s32(vacc1x0, vacc1x1);
+    const int32x4_t vsum1x23 = vpaddq_s32(vacc1x2, vacc1x3);
+    const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);
+    const int32x4_t vsum1x67 = vpaddq_s32(vacc1x6, vacc1x7);
+    const int32x4_t vsum2x01 = vpaddq_s32(vacc2x0, vacc2x1);
+    const int32x4_t vsum2x23 = vpaddq_s32(vacc2x2, vacc2x3);
+    const int32x4_t vsum2x45 = vpaddq_s32(vacc2x4, vacc2x5);
+    const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7);
+    int32x4_t vacc0x0123 = vpaddq_s32(vsum0x01, vsum0x23);
+    int32x4_t vacc0x4567 = vpaddq_s32(vsum0x45, vsum0x67);
+    int32x4_t vacc1x0123 = vpaddq_s32(vsum1x01, vsum1x23);
+    int32x4_t vacc1x4567 = vpaddq_s32(vsum1x45, vsum1x67);
+    int32x4_t vacc2x0123 = vpaddq_s32(vsum2x01, vsum2x23);
+    int32x4_t vacc2x4567 = vpaddq_s32(vsum2x45, vsum2x67);
+#else
+    const int32x2_t vpsum0x0 = vadd_s32(vget_low_s32(vacc0x0), vget_high_s32(vacc0x0));
+    const int32x2_t vpsum0x1 = vadd_s32(vget_low_s32(vacc0x1), vget_high_s32(vacc0x1));
+    const int32x2_t vpsum0x2 = vadd_s32(vget_low_s32(vacc0x2), vget_high_s32(vacc0x2));
+    const int32x2_t vpsum0x3 = vadd_s32(vget_low_s32(vacc0x3), vget_high_s32(vacc0x3));
+    const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1);
+    const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3);
+    int32x4_t vacc0x0123 = vcombine_s32(vsum0x01, vsum0x23 );
+    const int32x2_t vpsum0x4 = vadd_s32(vget_low_s32(vacc0x4), vget_high_s32(vacc0x4));
+    const int32x2_t vpsum0x5 = vadd_s32(vget_low_s32(vacc0x5), vget_high_s32(vacc0x5));
+    const int32x2_t vpsum0x6 = vadd_s32(vget_low_s32(vacc0x6), vget_high_s32(vacc0x6));
+    const int32x2_t vpsum0x7 = vadd_s32(vget_low_s32(vacc0x7), vget_high_s32(vacc0x7));
+    const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5);
+    const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7);
+    int32x4_t vacc0x4567 = vcombine_s32(vsum0x45, vsum0x67 );
+    const int32x2_t vpsum1x0 = vadd_s32(vget_low_s32(vacc1x0), vget_high_s32(vacc1x0));
+    const int32x2_t vpsum1x1 = vadd_s32(vget_low_s32(vacc1x1), vget_high_s32(vacc1x1));
+    const int32x2_t vpsum1x2 = vadd_s32(vget_low_s32(vacc1x2), vget_high_s32(vacc1x2));
+    const int32x2_t vpsum1x3 = vadd_s32(vget_low_s32(vacc1x3), vget_high_s32(vacc1x3));
+    const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1);
+    const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3);
+    int32x4_t vacc1x0123 = vcombine_s32(vsum1x01, vsum1x23 );
+    const int32x2_t vpsum1x4 = vadd_s32(vget_low_s32(vacc1x4), vget_high_s32(vacc1x4));
+    const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));
+    const int32x2_t vpsum1x6 = vadd_s32(vget_low_s32(vacc1x6), vget_high_s32(vacc1x6));
+    const int32x2_t vpsum1x7 = vadd_s32(vget_low_s32(vacc1x7), vget_high_s32(vacc1x7));
+    const int32x2_t vsum1x45 = vpadd_s32(vpsum1x4, vpsum1x5);
+    const int32x2_t vsum1x67 = vpadd_s32(vpsum1x6, vpsum1x7);
+    int32x4_t vacc1x4567 = vcombine_s32(vsum1x45, vsum1x67 );
+    const int32x2_t vpsum2x0 = vadd_s32(vget_low_s32(vacc2x0), vget_high_s32(vacc2x0));
+    const int32x2_t vpsum2x1 = vadd_s32(vget_low_s32(vacc2x1), vget_high_s32(vacc2x1));
+    const int32x2_t vpsum2x2 = vadd_s32(vget_low_s32(vacc2x2), vget_high_s32(vacc2x2));
+    const int32x2_t vpsum2x3 = vadd_s32(vget_low_s32(vacc2x3), vget_high_s32(vacc2x3));
+    const int32x2_t vsum2x01 = vpadd_s32(vpsum2x0, vpsum2x1);
+    const int32x2_t vsum2x23 = vpadd_s32(vpsum2x2, vpsum2x3);
+    int32x4_t vacc2x0123 = vcombine_s32(vsum2x01, vsum2x23 );
+    const int32x2_t vpsum2x4 = vadd_s32(vget_low_s32(vacc2x4), vget_high_s32(vacc2x4));
+    const int32x2_t vpsum2x5 = vadd_s32(vget_low_s32(vacc2x5), vget_high_s32(vacc2x5));
+    const int32x2_t vpsum2x6 = vadd_s32(vget_low_s32(vacc2x6), vget_high_s32(vacc2x6));
+    const int32x2_t vpsum2x7 = vadd_s32(vget_low_s32(vacc2x7), vget_high_s32(vacc2x7));
+    const int32x2_t vsum2x45 = vpadd_s32(vpsum2x4, vpsum2x5);
+    const int32x2_t vsum2x67 = vpadd_s32(vpsum2x6, vpsum2x7);
+    int32x4_t vacc2x4567 = vcombine_s32(vsum2x45, vsum2x67 );
+#endif
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+    int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
+    int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
+    int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout2x01234567 = vmax_s8(vout2x01234567, vget_low_s8(voutput_min));
+    vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min);
+
+    vout2x01234567 = vmin_s8(vout2x01234567, vget_low_s8(voutput_max));
+    vout0x01234567_1x01234567 = vminq_s8(vout0x01234567_1x01234567, voutput_max);
+
+    if (nc >= 8) {
+      vst1_s8(c2 + 0, vout2x01234567);
+      vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567));
+      vst1_s8(c0 + 0, vget_low_s8(vout0x01234567_1x01234567));
+
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1_lane_s8(c2, vout2x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
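Reviewer note: the c8 kernels keep one int32x4 register of partial sums per output column and only combine them after the K loop; the #if XNN_ARCH_ARM64 block above is that combine step. Below is a minimal sketch of the reduction for one row of eight columns, with illustrative names and an __aarch64__ guard standing in for XNN_ARCH_ARM64.

#include <arm_neon.h>
#include <stdint.h>

// Reduce eight per-column int32x4 partial-sum registers into two int32x4
// vectors holding one finished sum per lane for columns 0..3 and 4..7.
static void reduce_columns_sketch(const int32x4_t vacc[8],
                                  int32x4_t* out0123, int32x4_t* out4567) {
#if defined(__aarch64__)
  // vpaddq_s32 interleaves pairwise sums of both operands, so two levels of
  // pairwise adds leave one lane per column.
  const int32x4_t vsum01 = vpaddq_s32(vacc[0], vacc[1]);
  const int32x4_t vsum23 = vpaddq_s32(vacc[2], vacc[3]);
  const int32x4_t vsum45 = vpaddq_s32(vacc[4], vacc[5]);
  const int32x4_t vsum67 = vpaddq_s32(vacc[6], vacc[7]);
  *out0123 = vpaddq_s32(vsum01, vsum23);  // lanes = columns 0, 1, 2, 3
  *out4567 = vpaddq_s32(vsum45, vsum67);  // lanes = columns 4, 5, 6, 7
#else
  // AArch32 has no vpaddq_s32: fold each register to 64 bits first, then
  // pairwise-add and recombine.
  int32x2_t vhalf[8];
  for (int i = 0; i < 8; i++) {
    vhalf[i] = vadd_s32(vget_low_s32(vacc[i]), vget_high_s32(vacc[i]));
  }
  *out0123 = vcombine_s32(vpadd_s32(vhalf[0], vhalf[1]),
                          vpadd_s32(vhalf[2], vhalf[3]));
  *out4567 = vcombine_s32(vpadd_s32(vhalf[4], vhalf[5]),
                          vpadd_s32(vhalf[6], vhalf[7]));
#endif
}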
diff --git a/src/qs8-igemm/gen/4x16-minmax-neon-mlal-lane.c b/src/qs8-igemm/gen/4x16-minmax-neon-mlal-lane.c
new file mode 100644
index 0000000..1a301d9
--- /dev/null
+++ b/src/qs8-igemm/gen/4x16-minmax-neon-mlal-lane.c
@@ -0,0 +1,613 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/neon-mlal-lane.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/igemm.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (4 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+  int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    c3 = c2;
+  }
+
+  do {
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc1x0123 = vacc0x0123;
+    int32x4_t vacc1x4567 = vacc0x4567;
+    int32x4_t vacc1x89AB = vacc0x89AB;
+    int32x4_t vacc1xCDEF = vacc0xCDEF;
+    int32x4_t vacc2x0123 = vacc0x0123;
+    int32x4_t vacc2x4567 = vacc0x4567;
+    int32x4_t vacc2x89AB = vacc0x89AB;
+    int32x4_t vacc2xCDEF = vacc0xCDEF;
+    int32x4_t vacc3x0123 = vacc0x0123;
+    int32x4_t vacc3x4567 = vacc0x4567;
+    int32x4_t vacc3x89AB = vacc0x89AB;
+    int32x4_t vacc3xCDEF = vacc0xCDEF;
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      const int8_t* restrict a2 = a[2];
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+      }
+      const int8_t* restrict a3 = a[3];
+      if XNN_UNPREDICTABLE(a3 != zero) {
+        a3 = (const int8_t*) ((uintptr_t) a3 + a_offset);
+      }
+      a += 4;
+
+      size_t k = kc;
+      while (k >= 8 * sizeof(int8_t)) {
+        const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+        const int16x8_t vxa0 = vmovl_s8(va0);
+        const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+        const int16x8_t vxa1 = vmovl_s8(va1);
+        const int8x8_t va2 = vld1_s8(a2); a2 += 8;
+        const int16x8_t vxa2 = vmovl_s8(va2);
+        const int8x8_t va3 = vld1_s8(a3); a3 += 8;
+        const int16x8_t vxa3 = vmovl_s8(va3);
+
+        const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb01234567c0 = vmovl_s8(vb01234567c0);
+
+        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+        vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
+        vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
+        vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
+        vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
+        vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0);
+        vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0);
+        const int8x8_t vb89ABCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb89ABCDEFc0 = vmovl_s8(vb89ABCDEFc0);
+
+        vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
+        vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
+        vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
+        vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
+        vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
+        vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
+        vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
+        vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
+        const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb01234567c1 = vmovl_s8(vb01234567c1);
+
+        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+        vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
+        vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
+        vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa2), 1);
+        vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa2), 1);
+        vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1);
+        vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1);
+        const int8x8_t vb89ABCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb89ABCDEFc1 = vmovl_s8(vb89ABCDEFc1);
+
+        vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1);
+        vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1);
+        vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1);
+        vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1);
+        vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1);
+        vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1);
+        vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1);
+        vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1);
+        const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb01234567c2 = vmovl_s8(vb01234567c2);
+
+        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+        vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
+        vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
+        vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa2), 2);
+        vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa2), 2);
+        vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2);
+        vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa3), 2);
+        const int8x8_t vb89ABCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb89ABCDEFc2 = vmovl_s8(vb89ABCDEFc2);
+
+        vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2);
+        vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2);
+        vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa1), 2);
+        vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa1), 2);
+        vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa2), 2);
+        vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa2), 2);
+        vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2);
+        vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2);
+        const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb01234567c3 = vmovl_s8(vb01234567c3);
+
+        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+        vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
+        vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
+        vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa2), 3);
+        vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa2), 3);
+        vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa3), 3);
+        vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa3), 3);
+        const int8x8_t vb89ABCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb89ABCDEFc3 = vmovl_s8(vb89ABCDEFc3);
+
+        vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3);
+        vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3);
+        vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa1), 3);
+        vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa1), 3);
+        vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa2), 3);
+        vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa2), 3);
+        vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3);
+        vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3);
+
+        const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb01234567c4 = vmovl_s8(vb01234567c4);
+
+        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+        vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
+        vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
+        vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa2), 0);
+        vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa2), 0);
+        vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa3), 0);
+        vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa3), 0);
+        const int8x8_t vb89ABCDEFc4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb89ABCDEFc4 = vmovl_s8(vb89ABCDEFc4);
+
+        vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0);
+        vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0);
+        vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa1), 0);
+        vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa1), 0);
+        vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa2), 0);
+        vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa2), 0);
+        vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0);
+        vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0);
+        const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb01234567c5 = vmovl_s8(vb01234567c5);
+
+        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+        vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
+        vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
+        vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa2), 1);
+        vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa2), 1);
+        vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa3), 1);
+        vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa3), 1);
+        const int8x8_t vb89ABCDEFc5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb89ABCDEFc5 = vmovl_s8(vb89ABCDEFc5);
+
+        vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1);
+        vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1);
+        vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa1), 1);
+        vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa1), 1);
+        vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa2), 1);
+        vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa2), 1);
+        vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1);
+        vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1);
+        const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb01234567c6 = vmovl_s8(vb01234567c6);
+
+        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+        vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
+        vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
+        vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa2), 2);
+        vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa2), 2);
+        vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa3), 2);
+        vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa3), 2);
+        const int8x8_t vb89ABCDEFc6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb89ABCDEFc6 = vmovl_s8(vb89ABCDEFc6);
+
+        vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2);
+        vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2);
+        vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa1), 2);
+        vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa1), 2);
+        vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa2), 2);
+        vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa2), 2);
+        vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2);
+        vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2);
+        const int8x8_t vb01234567c7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb01234567c7 = vmovl_s8(vb01234567c7);
+
+        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa0), 3);
+        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa0), 3);
+        vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa1), 3);
+        vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa1), 3);
+        vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa2), 3);
+        vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa2), 3);
+        vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa3), 3);
+        vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa3), 3);
+        const int8x8_t vb89ABCDEFc7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb89ABCDEFc7 = vmovl_s8(vb89ABCDEFc7);
+
+        vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa0), 3);
+        vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa0), 3);
+        vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa1), 3);
+        vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa1), 3);
+        vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa2), 3);
+        vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa2), 3);
+        vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa3), 3);
+        vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa3), 3);
+
+        k -= 8 * sizeof(int8_t);
+      }
+      if XNN_UNLIKELY(k != 0) {
+        const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+        const int16x8_t vxa0 = vmovl_s8(va0);
+        const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
+        const int16x8_t vxa1 = vmovl_s8(va1);
+        const int8x8_t va2 = vld1_s8(a2); a2 = (const int8_t*) ((uintptr_t) a2 + k);
+        const int16x8_t vxa2 = vmovl_s8(va2);
+        const int8x8_t va3 = vld1_s8(a3); a3 = (const int8_t*) ((uintptr_t) a3 + k);
+        const int16x8_t vxa3 = vmovl_s8(va3);
+
+        const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb01234567c0 = vmovl_s8(vb01234567c0);
+        const int8x8_t vb89ABCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb89ABCDEFc0 = vmovl_s8(vb89ABCDEFc0);
+
+        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+        vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
+        vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
+        vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
+        vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
+        vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
+        vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
+        vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
+        vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
+        vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
+        vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
+        vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0);
+        vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0);
+        vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
+        vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
+
+        if (k >= 2 * sizeof(int8_t)) {
+          const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int16x8_t vxb01234567c1 = vmovl_s8(vb01234567c1);
+          const int8x8_t vb89ABCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int16x8_t vxb89ABCDEFc1 = vmovl_s8(vb89ABCDEFc1);
+
+          vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+          vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+          vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1);
+          vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1);
+          vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
+          vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
+          vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1);
+          vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1);
+          vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa2), 1);
+          vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa2), 1);
+          vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1);
+          vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1);
+          vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1);
+          vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1);
+          vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1);
+          vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1);
+
+          if (k > 2 * sizeof(int8_t)) {
+            const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int16x8_t vxb01234567c2 = vmovl_s8(vb01234567c2);
+            const int8x8_t vb89ABCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int16x8_t vxb89ABCDEFc2 = vmovl_s8(vb89ABCDEFc2);
+
+            vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+            vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+            vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2);
+            vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2);
+            vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
+            vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
+            vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa1), 2);
+            vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa1), 2);
+            vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa2), 2);
+            vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa2), 2);
+            vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa2), 2);
+            vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa2), 2);
+            vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2);
+            vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa3), 2);
+            vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2);
+            vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2);
+
+            if (k >= 4 * sizeof(int8_t)) {
+              const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+              const int16x8_t vxb01234567c3 = vmovl_s8(vb01234567c3);
+              const int8x8_t vb89ABCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+              const int16x8_t vxb89ABCDEFc3 = vmovl_s8(vb89ABCDEFc3);
+
+              vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+              vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+              vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3);
+              vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3);
+              vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
+              vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
+              vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa1), 3);
+              vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa1), 3);
+              vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa2), 3);
+              vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa2), 3);
+              vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa2), 3);
+              vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa2), 3);
+              vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa3), 3);
+              vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa3), 3);
+              vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3);
+              vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3);
+
+              if (k > 4 * sizeof(int8_t)) {
+                const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+                const int16x8_t vxb01234567c4 = vmovl_s8(vb01234567c4);
+                const int8x8_t vb89ABCDEFc4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+                const int16x8_t vxb89ABCDEFc4 = vmovl_s8(vb89ABCDEFc4);
+
+                vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+                vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+                vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0);
+                vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0);
+                vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
+                vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
+                vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa1), 0);
+                vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa1), 0);
+                vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa2), 0);
+                vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa2), 0);
+                vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa2), 0);
+                vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa2), 0);
+                vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa3), 0);
+                vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa3), 0);
+                vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0);
+                vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0);
+
+                if (k >= 6 * sizeof(int8_t)) {
+                  const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+                  const int16x8_t vxb01234567c5 = vmovl_s8(vb01234567c5);
+                  const int8x8_t vb89ABCDEFc5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+                  const int16x8_t vxb89ABCDEFc5 = vmovl_s8(vb89ABCDEFc5);
+
+                  vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+                  vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+                  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1);
+                  vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1);
+                  vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
+                  vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
+                  vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa1), 1);
+                  vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa1), 1);
+                  vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa2), 1);
+                  vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa2), 1);
+                  vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa2), 1);
+                  vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa2), 1);
+                  vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa3), 1);
+                  vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa3), 1);
+                  vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1);
+                  vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1);
+
+                  if (k > 6 * sizeof(int8_t)) {
+                    const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+                    const int16x8_t vxb01234567c6 = vmovl_s8(vb01234567c6);
+                    const int8x8_t vb89ABCDEFc6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+                    const int16x8_t vxb89ABCDEFc6 = vmovl_s8(vb89ABCDEFc6);
+
+                    vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+                    vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+                    vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2);
+                    vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2);
+                    vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
+                    vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
+                    vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa1), 2);
+                    vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa1), 2);
+                    vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa2), 2);
+                    vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa2), 2);
+                    vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa2), 2);
+                    vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa2), 2);
+                    vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa3), 2);
+                    vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa3), 2);
+                    vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2);
+                    vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2);
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+      p -= 4 * sizeof(void*);
+    } while (p != 0);
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+    vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc1x89AB = vqrdmulhq_s32(vacc1x89AB, vmultiplier);
+    vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier);
+    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+    vacc2x89AB = vqrdmulhq_s32(vacc2x89AB, vmultiplier);
+    vacc2xCDEF = vqrdmulhq_s32(vacc2xCDEF, vmultiplier);
+    vacc3x0123 = vqrdmulhq_s32(vacc3x0123, vmultiplier);
+    vacc3x4567 = vqrdmulhq_s32(vacc3x4567, vmultiplier);
+    vacc3x89AB = vqrdmulhq_s32(vacc3x89AB, vmultiplier);
+    vacc3xCDEF = vqrdmulhq_s32(vacc3xCDEF, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+    vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc1x89AB = vsraq_n_s32(vacc1x89AB, vbicq_s32(vacc1x89AB, vzero_shift_mask), 31);
+    vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31);
+    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+    vacc2x89AB = vsraq_n_s32(vacc2x89AB, vbicq_s32(vacc2x89AB, vzero_shift_mask), 31);
+    vacc2xCDEF = vsraq_n_s32(vacc2xCDEF, vbicq_s32(vacc2xCDEF, vzero_shift_mask), 31);
+    vacc3x0123 = vsraq_n_s32(vacc3x0123, vbicq_s32(vacc3x0123, vzero_shift_mask), 31);
+    vacc3x4567 = vsraq_n_s32(vacc3x4567, vbicq_s32(vacc3x4567, vzero_shift_mask), 31);
+    vacc3x89AB = vsraq_n_s32(vacc3x89AB, vbicq_s32(vacc3x89AB, vzero_shift_mask), 31);
+    vacc3xCDEF = vsraq_n_s32(vacc3xCDEF, vbicq_s32(vacc3xCDEF, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc1x89AB = vrshlq_s32(vacc1x89AB, vright_shift);
+    vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_shift);
+    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+    vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_shift);
+    vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_shift);
+    vacc3x0123 = vrshlq_s32(vacc3x0123, vright_shift);
+    vacc3x4567 = vrshlq_s32(vacc3x4567, vright_shift);
+    vacc3x89AB = vrshlq_s32(vacc3x89AB, vright_shift);
+    vacc3xCDEF = vrshlq_s32(vacc3xCDEF, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+    const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), voutput_zero_point);
+    const int16x8_t vacc3x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567), voutput_zero_point);
+    const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+    int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
+    int8x16_t vout2x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc2x89ABCDEF);
+    int8x16_t vout3x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc3x01234567), vacc3x89ABCDEF);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+    const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)), voutput_zero_point);
+    const int16x8_t vacc3x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x0123), vqmovn_s32(vacc3x4567)), voutput_zero_point);
+    const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3xCDEF)), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+    int8x16_t vout1x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc1x01234567), vqmovn_s16(vacc1x89ABCDEF));
+    int8x16_t vout2x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc2x89ABCDEF));
+    int8x16_t vout3x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc3x01234567), vqmovn_s16(vacc3x89ABCDEF));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout3x0123456789ABCDEF = vmaxq_s8(vout3x0123456789ABCDEF, voutput_min);
+    vout2x0123456789ABCDEF = vmaxq_s8(vout2x0123456789ABCDEF, voutput_min);
+    vout1x0123456789ABCDEF = vmaxq_s8(vout1x0123456789ABCDEF, voutput_min);
+    vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+
+    vout3x0123456789ABCDEF = vminq_s8(vout3x0123456789ABCDEF, voutput_max);
+    vout2x0123456789ABCDEF = vminq_s8(vout2x0123456789ABCDEF, voutput_max);
+    vout1x0123456789ABCDEF = vminq_s8(vout1x0123456789ABCDEF, voutput_max);
+    vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+
+    if (nc >= 16) {
+      vst1q_s8(c3 + 0, vout3x0123456789ABCDEF);
+      vst1q_s8(c2 + 0, vout2x0123456789ABCDEF);
+      vst1q_s8(c1 + 0, vout1x0123456789ABCDEF);
+      vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+
+      c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 16;
+    } else {
+      int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8(vout1x0123456789ABCDEF));
+      int8x16_t vout2x01234567_3x01234567 = vcombine_s8(vget_low_s8(vout2x0123456789ABCDEF), vget_low_s8(vout3x0123456789ABCDEF));
+      if (nc & 8) {
+        vst1_s8(c3, vget_high_s8(vout2x01234567_3x01234567)); c3 += 8;
+        vst1_s8(c2, vget_low_s8(vout2x01234567_3x01234567)); c2 += 8;
+        vst1_s8(c1, vget_high_s8(vout0x01234567_1x01234567)); c1 += 8;
+        vst1_s8(c0, vget_low_s8(vout0x01234567_1x01234567)); c0 += 8;
+        vout2x01234567_3x01234567 = vcombine_s8(vget_high_s8(vout2x0123456789ABCDEF), vget_high_s8(vout3x0123456789ABCDEF));
+        vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0123456789ABCDEF));
+      }
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c3, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 2); c3 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 0); c2 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c3, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 4); c3 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 0); c2 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c3, vout2x01234567_3x01234567, 8);
+        vst1q_lane_s8(c2, vout2x01234567_3x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/4x16-minmax-neon-mull-addw-dup.c b/src/qs8-igemm/gen/4x16-minmax-neon-mull-addw-dup.c
new file mode 100644
index 0000000..304b2df
--- /dev/null
+++ b/src/qs8-igemm/gen/4x16-minmax-neon-mull-addw-dup.c
@@ -0,0 +1,694 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/neon-mull-addw-dup.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/gemm.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (4 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+  int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    c3 = c2;
+  }
+
+  do {
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc1x0123 = vacc0x0123;
+    int32x4_t vacc1x4567 = vacc0x4567;
+    int32x4_t vacc1x89AB = vacc0x89AB;
+    int32x4_t vacc1xCDEF = vacc0xCDEF;
+    int32x4_t vacc2x0123 = vacc0x0123;
+    int32x4_t vacc2x4567 = vacc0x4567;
+    int32x4_t vacc2x89AB = vacc0x89AB;
+    int32x4_t vacc2xCDEF = vacc0xCDEF;
+    int32x4_t vacc3x0123 = vacc0x0123;
+    int32x4_t vacc3x4567 = vacc0x4567;
+    int32x4_t vacc3x89AB = vacc0x89AB;
+    int32x4_t vacc3xCDEF = vacc0xCDEF;
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      const int8_t* restrict a2 = a[2];
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+      }
+      const int8_t* restrict a3 = a[3];
+      if XNN_UNPREDICTABLE(a3 != zero) {
+        a3 = (const int8_t*) ((uintptr_t) a3 + a_offset);
+      }
+      a += 4;
+
+      size_t k = kc;
+      while (k >= 8 * sizeof(int8_t)) {
+        const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+        const int8x8_t va2 = vld1_s8(a2); a2 += 8;
+        const int8x8_t va3 = vld1_s8(a3); a3 += 8;
+
+        const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va0, 0));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c0));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c0));
+        const int16x8_t vprod1x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va1, 0));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c0));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c0));
+        const int16x8_t vprod2x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va2, 0));
+        vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c0));
+        vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c0));
+        const int16x8_t vprod3x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va3, 0));
+        vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c0));
+        vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c0));
+        const int8x8_t vb89ABCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va0, 0));
+        vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc0));
+        vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc0));
+        const int16x8_t vprod1x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va1, 0));
+        vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc0));
+        vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc0));
+        const int16x8_t vprod2x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va2, 0));
+        vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc0));
+        vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc0));
+        const int16x8_t vprod3x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va3, 0));
+        vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc0));
+        vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc0));
+        const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va0, 1));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c1));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c1));
+        const int16x8_t vprod1x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va1, 1));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c1));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c1));
+        const int16x8_t vprod2x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va2, 1));
+        vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c1));
+        vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c1));
+        const int16x8_t vprod3x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va3, 1));
+        vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c1));
+        vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c1));
+        const int8x8_t vb89ABCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va0, 1));
+        vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc1));
+        vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc1));
+        const int16x8_t vprod1x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va1, 1));
+        vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc1));
+        vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc1));
+        const int16x8_t vprod2x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va2, 1));
+        vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc1));
+        vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc1));
+        const int16x8_t vprod3x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va3, 1));
+        vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc1));
+        vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc1));
+        const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va0, 2));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c2));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c2));
+        const int16x8_t vprod1x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va1, 2));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c2));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c2));
+        const int16x8_t vprod2x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va2, 2));
+        vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c2));
+        vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c2));
+        const int16x8_t vprod3x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va3, 2));
+        vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c2));
+        vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c2));
+        const int8x8_t vb89ABCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va0, 2));
+        vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc2));
+        vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc2));
+        const int16x8_t vprod1x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va1, 2));
+        vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc2));
+        vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc2));
+        const int16x8_t vprod2x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va2, 2));
+        vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc2));
+        vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc2));
+        const int16x8_t vprod3x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va3, 2));
+        vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc2));
+        vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc2));
+        const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));
+        const int16x8_t vprod1x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va1, 3));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c3));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c3));
+        const int16x8_t vprod2x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va2, 3));
+        vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c3));
+        vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c3));
+        const int16x8_t vprod3x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va3, 3));
+        vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c3));
+        vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c3));
+        const int8x8_t vb89ABCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va0, 3));
+        vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc3));
+        vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc3));
+        const int16x8_t vprod1x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va1, 3));
+        vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc3));
+        vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc3));
+        const int16x8_t vprod2x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va2, 3));
+        vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc3));
+        vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc3));
+        const int16x8_t vprod3x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va3, 3));
+        vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc3));
+        vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc3));
+        const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va0, 4));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c4));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c4));
+        const int16x8_t vprod1x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va1, 4));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c4));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c4));
+        const int16x8_t vprod2x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va2, 4));
+        vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c4));
+        vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c4));
+        const int16x8_t vprod3x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va3, 4));
+        vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c4));
+        vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c4));
+        const int8x8_t vb89ABCDEFc4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va0, 4));
+        vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc4));
+        vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc4));
+        const int16x8_t vprod1x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va1, 4));
+        vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc4));
+        vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc4));
+        const int16x8_t vprod2x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va2, 4));
+        vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc4));
+        vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc4));
+        const int16x8_t vprod3x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va3, 4));
+        vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc4));
+        vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc4));
+        const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va0, 5));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c5));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c5));
+        const int16x8_t vprod1x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va1, 5));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c5));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c5));
+        const int16x8_t vprod2x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va2, 5));
+        vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c5));
+        vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c5));
+        const int16x8_t vprod3x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va3, 5));
+        vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c5));
+        vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c5));
+        const int8x8_t vb89ABCDEFc5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va0, 5));
+        vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc5));
+        vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc5));
+        const int16x8_t vprod1x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va1, 5));
+        vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc5));
+        vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc5));
+        const int16x8_t vprod2x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va2, 5));
+        vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc5));
+        vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc5));
+        const int16x8_t vprod3x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va3, 5));
+        vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc5));
+        vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc5));
+        const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va0, 6));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c6));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c6));
+        const int16x8_t vprod1x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va1, 6));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c6));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c6));
+        const int16x8_t vprod2x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va2, 6));
+        vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c6));
+        vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c6));
+        const int16x8_t vprod3x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va3, 6));
+        vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c6));
+        vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c6));
+        const int8x8_t vb89ABCDEFc6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va0, 6));
+        vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc6));
+        vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc6));
+        const int16x8_t vprod1x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va1, 6));
+        vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc6));
+        vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc6));
+        const int16x8_t vprod2x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va2, 6));
+        vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc6));
+        vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc6));
+        const int16x8_t vprod3x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va3, 6));
+        vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc6));
+        vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc6));
+        const int8x8_t vb01234567c7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c7 = vmull_s8(vb01234567c7, vdup_lane_s8(va0, 7));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c7));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c7));
+        const int16x8_t vprod1x01234567c7 = vmull_s8(vb01234567c7, vdup_lane_s8(va1, 7));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c7));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c7));
+        const int16x8_t vprod2x01234567c7 = vmull_s8(vb01234567c7, vdup_lane_s8(va2, 7));
+        vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c7));
+        vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c7));
+        const int16x8_t vprod3x01234567c7 = vmull_s8(vb01234567c7, vdup_lane_s8(va3, 7));
+        vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c7));
+        vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c7));
+        const int8x8_t vb89ABCDEFc7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x89ABCDEFc7 = vmull_s8(vb89ABCDEFc7, vdup_lane_s8(va0, 7));
+        vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc7));
+        vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc7));
+        const int16x8_t vprod1x89ABCDEFc7 = vmull_s8(vb89ABCDEFc7, vdup_lane_s8(va1, 7));
+        vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc7));
+        vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc7));
+        const int16x8_t vprod2x89ABCDEFc7 = vmull_s8(vb89ABCDEFc7, vdup_lane_s8(va2, 7));
+        vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc7));
+        vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc7));
+        const int16x8_t vprod3x89ABCDEFc7 = vmull_s8(vb89ABCDEFc7, vdup_lane_s8(va3, 7));
+        vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc7));
+        vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc7));
+
+        k -= 8 * sizeof(int8_t);
+      }
+      if XNN_UNLIKELY(k != 0) {
+        const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+        const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
+        const int8x8_t va2 = vld1_s8(a2); a2 = (const int8_t*) ((uintptr_t) a2 + k);
+        const int8x8_t va3 = vld1_s8(a3); a3 = (const int8_t*) ((uintptr_t) a3 + k);
+
+        const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va0, 0));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c0));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c0));
+        const int16x8_t vprod0x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va0, 0));
+        vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc0));
+        vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc0));
+        const int16x8_t vprod1x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va1, 0));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c0));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c0));
+        const int16x8_t vprod1x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va1, 0));
+        vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc0));
+        vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc0));
+        const int16x8_t vprod2x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va2, 0));
+        vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c0));
+        vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c0));
+        const int16x8_t vprod2x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va2, 0));
+        vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc0));
+        vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc0));
+        const int16x8_t vprod3x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va3, 0));
+        vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c0));
+        vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c0));
+        const int16x8_t vprod3x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va3, 0));
+        vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc0));
+        vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc0));
+
+        if (k >= 2 * sizeof(int8_t)) {
+          const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb89ABCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          const int16x8_t vprod0x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va0, 1));
+          vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c1));
+          vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c1));
+          const int16x8_t vprod0x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va0, 1));
+          vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc1));
+          vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc1));
+          const int16x8_t vprod1x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va1, 1));
+          vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c1));
+          vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c1));
+          const int16x8_t vprod1x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va1, 1));
+          vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc1));
+          vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc1));
+          const int16x8_t vprod2x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va2, 1));
+          vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c1));
+          vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c1));
+          const int16x8_t vprod2x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va2, 1));
+          vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc1));
+          vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc1));
+          const int16x8_t vprod3x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va3, 1));
+          vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c1));
+          vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c1));
+          const int16x8_t vprod3x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va3, 1));
+          vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc1));
+          vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc1));
+
+          if (k > 2 * sizeof(int8_t)) {
+            const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int8x8_t vb89ABCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+            const int16x8_t vprod0x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va0, 2));
+            vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c2));
+            vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c2));
+            const int16x8_t vprod0x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va0, 2));
+            vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc2));
+            vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc2));
+            const int16x8_t vprod1x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va1, 2));
+            vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c2));
+            vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c2));
+            const int16x8_t vprod1x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va1, 2));
+            vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc2));
+            vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc2));
+            const int16x8_t vprod2x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va2, 2));
+            vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c2));
+            vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c2));
+            const int16x8_t vprod2x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va2, 2));
+            vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc2));
+            vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc2));
+            const int16x8_t vprod3x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va3, 2));
+            vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c2));
+            vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c2));
+            const int16x8_t vprod3x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va3, 2));
+            vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc2));
+            vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc2));
+
+            if (k >= 4 * sizeof(int8_t)) {
+              const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+              const int8x8_t vb89ABCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+              const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));
+              vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
+              vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));
+              const int16x8_t vprod0x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va0, 3));
+              vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc3));
+              vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc3));
+              const int16x8_t vprod1x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va1, 3));
+              vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c3));
+              vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c3));
+              const int16x8_t vprod1x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va1, 3));
+              vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc3));
+              vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc3));
+              const int16x8_t vprod2x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va2, 3));
+              vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c3));
+              vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c3));
+              const int16x8_t vprod2x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va2, 3));
+              vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc3));
+              vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc3));
+              const int16x8_t vprod3x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va3, 3));
+              vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c3));
+              vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c3));
+              const int16x8_t vprod3x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va3, 3));
+              vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc3));
+              vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc3));
+
+              if (k > 4 * sizeof(int8_t)) {
+                const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+                const int8x8_t vb89ABCDEFc4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+                const int16x8_t vprod0x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va0, 4));
+                vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c4));
+                vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c4));
+                const int16x8_t vprod0x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va0, 4));
+                vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc4));
+                vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc4));
+                const int16x8_t vprod1x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va1, 4));
+                vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c4));
+                vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c4));
+                const int16x8_t vprod1x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va1, 4));
+                vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc4));
+                vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc4));
+                const int16x8_t vprod2x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va2, 4));
+                vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c4));
+                vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c4));
+                const int16x8_t vprod2x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va2, 4));
+                vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc4));
+                vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc4));
+                const int16x8_t vprod3x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va3, 4));
+                vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c4));
+                vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c4));
+                const int16x8_t vprod3x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va3, 4));
+                vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc4));
+                vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc4));
+
+                if (k >= 6 * sizeof(int8_t)) {
+                  const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+                  const int8x8_t vb89ABCDEFc5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+                  const int16x8_t vprod0x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va0, 5));
+                  vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c5));
+                  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c5));
+                  const int16x8_t vprod0x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va0, 5));
+                  vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc5));
+                  vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc5));
+                  const int16x8_t vprod1x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va1, 5));
+                  vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c5));
+                  vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c5));
+                  const int16x8_t vprod1x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va1, 5));
+                  vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc5));
+                  vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc5));
+                  const int16x8_t vprod2x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va2, 5));
+                  vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c5));
+                  vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c5));
+                  const int16x8_t vprod2x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va2, 5));
+                  vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc5));
+                  vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc5));
+                  const int16x8_t vprod3x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va3, 5));
+                  vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c5));
+                  vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c5));
+                  const int16x8_t vprod3x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va3, 5));
+                  vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc5));
+                  vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc5));
+
+                  if (k > 6 * sizeof(int8_t)) {
+                    const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+                    const int8x8_t vb89ABCDEFc6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+                    const int16x8_t vprod0x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va0, 6));
+                    vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c6));
+                    vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c6));
+                    const int16x8_t vprod0x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va0, 6));
+                    vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc6));
+                    vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc6));
+                    const int16x8_t vprod1x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va1, 6));
+                    vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c6));
+                    vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c6));
+                    const int16x8_t vprod1x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va1, 6));
+                    vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc6));
+                    vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc6));
+                    const int16x8_t vprod2x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va2, 6));
+                    vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c6));
+                    vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c6));
+                    const int16x8_t vprod2x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va2, 6));
+                    vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc6));
+                    vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc6));
+                    const int16x8_t vprod3x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va3, 6));
+                    vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c6));
+                    vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c6));
+                    const int16x8_t vprod3x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va3, 6));
+                    vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc6));
+                    vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc6));
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+      p -= 4 * sizeof(void*);
+    } while (p != 0);
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+    vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc1x89AB = vqrdmulhq_s32(vacc1x89AB, vmultiplier);
+    vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier);
+    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+    vacc2x89AB = vqrdmulhq_s32(vacc2x89AB, vmultiplier);
+    vacc2xCDEF = vqrdmulhq_s32(vacc2xCDEF, vmultiplier);
+    vacc3x0123 = vqrdmulhq_s32(vacc3x0123, vmultiplier);
+    vacc3x4567 = vqrdmulhq_s32(vacc3x4567, vmultiplier);
+    vacc3x89AB = vqrdmulhq_s32(vacc3x89AB, vmultiplier);
+    vacc3xCDEF = vqrdmulhq_s32(vacc3xCDEF, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+    vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc1x89AB = vsraq_n_s32(vacc1x89AB, vbicq_s32(vacc1x89AB, vzero_shift_mask), 31);
+    vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31);
+    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+    vacc2x89AB = vsraq_n_s32(vacc2x89AB, vbicq_s32(vacc2x89AB, vzero_shift_mask), 31);
+    vacc2xCDEF = vsraq_n_s32(vacc2xCDEF, vbicq_s32(vacc2xCDEF, vzero_shift_mask), 31);
+    vacc3x0123 = vsraq_n_s32(vacc3x0123, vbicq_s32(vacc3x0123, vzero_shift_mask), 31);
+    vacc3x4567 = vsraq_n_s32(vacc3x4567, vbicq_s32(vacc3x4567, vzero_shift_mask), 31);
+    vacc3x89AB = vsraq_n_s32(vacc3x89AB, vbicq_s32(vacc3x89AB, vzero_shift_mask), 31);
+    vacc3xCDEF = vsraq_n_s32(vacc3xCDEF, vbicq_s32(vacc3xCDEF, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc1x89AB = vrshlq_s32(vacc1x89AB, vright_shift);
+    vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_shift);
+    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+    vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_shift);
+    vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_shift);
+    vacc3x0123 = vrshlq_s32(vacc3x0123, vright_shift);
+    vacc3x4567 = vrshlq_s32(vacc3x4567, vright_shift);
+    vacc3x89AB = vrshlq_s32(vacc3x89AB, vright_shift);
+    vacc3xCDEF = vrshlq_s32(vacc3xCDEF, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+    const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), voutput_zero_point);
+    const int16x8_t vacc3x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567), voutput_zero_point);
+    const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+    int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
+    int8x16_t vout2x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc2x89ABCDEF);
+    int8x16_t vout3x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc3x01234567), vacc3x89ABCDEF);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+    const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)), voutput_zero_point);
+    const int16x8_t vacc3x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x0123), vqmovn_s32(vacc3x4567)), voutput_zero_point);
+    const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3xCDEF)), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+    int8x16_t vout1x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc1x01234567), vqmovn_s16(vacc1x89ABCDEF));
+    int8x16_t vout2x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc2x89ABCDEF));
+    int8x16_t vout3x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc3x01234567), vqmovn_s16(vacc3x89ABCDEF));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout3x0123456789ABCDEF = vmaxq_s8(vout3x0123456789ABCDEF, voutput_min);
+    vout2x0123456789ABCDEF = vmaxq_s8(vout2x0123456789ABCDEF, voutput_min);
+    vout1x0123456789ABCDEF = vmaxq_s8(vout1x0123456789ABCDEF, voutput_min);
+    vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+
+    vout3x0123456789ABCDEF = vminq_s8(vout3x0123456789ABCDEF, voutput_max);
+    vout2x0123456789ABCDEF = vminq_s8(vout2x0123456789ABCDEF, voutput_max);
+    vout1x0123456789ABCDEF = vminq_s8(vout1x0123456789ABCDEF, voutput_max);
+    vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+
+    if (nc >= 16) {
+      vst1q_s8(c3 + 0, vout3x0123456789ABCDEF);
+      vst1q_s8(c2 + 0, vout2x0123456789ABCDEF);
+      vst1q_s8(c1 + 0, vout1x0123456789ABCDEF);
+      vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+
+      c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 16;
+    } else {
+      int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8(vout1x0123456789ABCDEF));
+      int8x16_t vout2x01234567_3x01234567 = vcombine_s8(vget_low_s8(vout2x0123456789ABCDEF), vget_low_s8(vout3x0123456789ABCDEF));
+      if (nc & 8) {
+        vst1_s8(c3, vget_high_s8(vout2x01234567_3x01234567)); c3 += 8;
+        vst1_s8(c2, vget_low_s8(vout2x01234567_3x01234567)); c2 += 8;
+        vst1_s8(c1, vget_high_s8(vout0x01234567_1x01234567)); c1 += 8;
+        vst1_s8(c0, vget_low_s8(vout0x01234567_1x01234567)); c0 += 8;
+        vout2x01234567_3x01234567 = vcombine_s8(vget_high_s8(vout2x0123456789ABCDEF), vget_high_s8(vout3x0123456789ABCDEF));
+        vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0123456789ABCDEF));
+      }
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c3, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 2); c3 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 0); c2 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c3, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 4); c3 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 0); c2 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c3, vout2x01234567_3x01234567, 8);
+        vst1q_lane_s8(c2, vout2x01234567_3x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/4x16c16-minmax-neon-mlal-padal.c b/src/qs8-igemm/gen/4x16c16-minmax-neon-mlal-padal.c
new file mode 100644
index 0000000..df63cae
--- /dev/null
+++ b/src/qs8-igemm/gen/4x16c16-minmax-neon-mlal-padal.c
@@ -0,0 +1,679 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/c16-neon-mlal-padal.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (4 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 16);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+  int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    c3 = c2;
+  }
+
+  do {
+    int32x4_t vacc0x0 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x1 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x2 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x3 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x4 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x5 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x6 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x7 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x8 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x9 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x10 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x11 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x12 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x13 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x14 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x15 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc1x0 = vacc0x0;
+    int32x4_t vacc1x1 = vacc0x1;
+    int32x4_t vacc1x2 = vacc0x2;
+    int32x4_t vacc1x3 = vacc0x3;
+    int32x4_t vacc1x4 = vacc0x4;
+    int32x4_t vacc1x5 = vacc0x5;
+    int32x4_t vacc1x6 = vacc0x6;
+    int32x4_t vacc1x7 = vacc0x7;
+    int32x4_t vacc1x8 = vacc0x8;
+    int32x4_t vacc1x9 = vacc0x9;
+    int32x4_t vacc1x10 = vacc0x10;
+    int32x4_t vacc1x11 = vacc0x11;
+    int32x4_t vacc1x12 = vacc0x12;
+    int32x4_t vacc1x13 = vacc0x13;
+    int32x4_t vacc1x14 = vacc0x14;
+    int32x4_t vacc1x15 = vacc0x15;
+    int32x4_t vacc2x0 = vacc0x0;
+    int32x4_t vacc2x1 = vacc0x1;
+    int32x4_t vacc2x2 = vacc0x2;
+    int32x4_t vacc2x3 = vacc0x3;
+    int32x4_t vacc2x4 = vacc0x4;
+    int32x4_t vacc2x5 = vacc0x5;
+    int32x4_t vacc2x6 = vacc0x6;
+    int32x4_t vacc2x7 = vacc0x7;
+    int32x4_t vacc2x8 = vacc0x8;
+    int32x4_t vacc2x9 = vacc0x9;
+    int32x4_t vacc2x10 = vacc0x10;
+    int32x4_t vacc2x11 = vacc0x11;
+    int32x4_t vacc2x12 = vacc0x12;
+    int32x4_t vacc2x13 = vacc0x13;
+    int32x4_t vacc2x14 = vacc0x14;
+    int32x4_t vacc2x15 = vacc0x15;
+    int32x4_t vacc3x0 = vacc0x0;
+    int32x4_t vacc3x1 = vacc0x1;
+    int32x4_t vacc3x2 = vacc0x2;
+    int32x4_t vacc3x3 = vacc0x3;
+    int32x4_t vacc3x4 = vacc0x4;
+    int32x4_t vacc3x5 = vacc0x5;
+    int32x4_t vacc3x6 = vacc0x6;
+    int32x4_t vacc3x7 = vacc0x7;
+    int32x4_t vacc3x8 = vacc0x8;
+    int32x4_t vacc3x9 = vacc0x9;
+    int32x4_t vacc3x10 = vacc0x10;
+    int32x4_t vacc3x11 = vacc0x11;
+    int32x4_t vacc3x12 = vacc0x12;
+    int32x4_t vacc3x13 = vacc0x13;
+    int32x4_t vacc3x14 = vacc0x14;
+    int32x4_t vacc3x15 = vacc0x15;
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      const int8_t* restrict a2 = a[2];
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+      }
+      const int8_t* restrict a3 = a[3];
+      if XNN_UNPREDICTABLE(a3 != zero) {
+        a3 = (const int8_t*) ((uintptr_t) a3 + a_offset);
+      }
+      a += 4;
+
+      // KC loop of 16 with up to 15 remainder
+      size_t k = 0;
+      while (k < kc) {
+        const int8x16_t va0 = vld1q_s8(a0); a0 += 16;
+        const int8x16_t va1 = vld1q_s8(a1); a1 += 16;
+        const int8x16_t va2 = vld1q_s8(a2); a2 += 16;
+        const int8x16_t va3 = vld1q_s8(a3); a3 += 16;
+
+        const int8x16_t vb0 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb1 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb2 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb3 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb4 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb5 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb6 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb7 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb8 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb9 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb10 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb11 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb12 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb13 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb14 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb15 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+
+        int16x8_t vprod0x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va0));
+        int16x8_t vprod1x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va1));
+        int16x8_t vprod2x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va2));
+        int16x8_t vprod3x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va3));
+        vprod0x0 = vmlal_s8(vprod0x0, vget_high_s8(vb0), vget_high_s8(va0));
+        vprod1x0 = vmlal_s8(vprod1x0, vget_high_s8(vb0), vget_high_s8(va1));
+        vprod2x0 = vmlal_s8(vprod2x0, vget_high_s8(vb0), vget_high_s8(va2));
+        vprod3x0 = vmlal_s8(vprod3x0, vget_high_s8(vb0), vget_high_s8(va3));
+        vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+        vacc1x0 = vpadalq_s16(vacc1x0, vprod1x0);
+        vacc2x0 = vpadalq_s16(vacc2x0, vprod2x0);
+        vacc3x0 = vpadalq_s16(vacc3x0, vprod3x0);
+        int16x8_t vprod0x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va0));
+        int16x8_t vprod1x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va1));
+        int16x8_t vprod2x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va2));
+        int16x8_t vprod3x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va3));
+        vprod0x1 = vmlal_s8(vprod0x1, vget_high_s8(vb1), vget_high_s8(va0));
+        vprod1x1 = vmlal_s8(vprod1x1, vget_high_s8(vb1), vget_high_s8(va1));
+        vprod2x1 = vmlal_s8(vprod2x1, vget_high_s8(vb1), vget_high_s8(va2));
+        vprod3x1 = vmlal_s8(vprod3x1, vget_high_s8(vb1), vget_high_s8(va3));
+        vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+        vacc1x1 = vpadalq_s16(vacc1x1, vprod1x1);
+        vacc2x1 = vpadalq_s16(vacc2x1, vprod2x1);
+        vacc3x1 = vpadalq_s16(vacc3x1, vprod3x1);
+        int16x8_t vprod0x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va0));
+        int16x8_t vprod1x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va1));
+        int16x8_t vprod2x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va2));
+        int16x8_t vprod3x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va3));
+        vprod0x2 = vmlal_s8(vprod0x2, vget_high_s8(vb2), vget_high_s8(va0));
+        vprod1x2 = vmlal_s8(vprod1x2, vget_high_s8(vb2), vget_high_s8(va1));
+        vprod2x2 = vmlal_s8(vprod2x2, vget_high_s8(vb2), vget_high_s8(va2));
+        vprod3x2 = vmlal_s8(vprod3x2, vget_high_s8(vb2), vget_high_s8(va3));
+        vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+        vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);
+        vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
+        vacc3x2 = vpadalq_s16(vacc3x2, vprod3x2);
+        int16x8_t vprod0x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va0));
+        int16x8_t vprod1x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va1));
+        int16x8_t vprod2x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va2));
+        int16x8_t vprod3x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va3));
+        vprod0x3 = vmlal_s8(vprod0x3, vget_high_s8(vb3), vget_high_s8(va0));
+        vprod1x3 = vmlal_s8(vprod1x3, vget_high_s8(vb3), vget_high_s8(va1));
+        vprod2x3 = vmlal_s8(vprod2x3, vget_high_s8(vb3), vget_high_s8(va2));
+        vprod3x3 = vmlal_s8(vprod3x3, vget_high_s8(vb3), vget_high_s8(va3));
+        vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+        vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);
+        vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
+        vacc3x3 = vpadalq_s16(vacc3x3, vprod3x3);
+        int16x8_t vprod0x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va0));
+        int16x8_t vprod1x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va1));
+        int16x8_t vprod2x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va2));
+        int16x8_t vprod3x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va3));
+        vprod0x4 = vmlal_s8(vprod0x4, vget_high_s8(vb4), vget_high_s8(va0));
+        vprod1x4 = vmlal_s8(vprod1x4, vget_high_s8(vb4), vget_high_s8(va1));
+        vprod2x4 = vmlal_s8(vprod2x4, vget_high_s8(vb4), vget_high_s8(va2));
+        vprod3x4 = vmlal_s8(vprod3x4, vget_high_s8(vb4), vget_high_s8(va3));
+        vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+        vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);
+        vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);
+        vacc3x4 = vpadalq_s16(vacc3x4, vprod3x4);
+        int16x8_t vprod0x5 = vmull_s8(vget_low_s8(vb5), vget_low_s8(va0));
+        int16x8_t vprod1x5 = vmull_s8(vget_low_s8(vb5), vget_low_s8(va1));
+        int16x8_t vprod2x5 = vmull_s8(vget_low_s8(vb5), vget_low_s8(va2));
+        int16x8_t vprod3x5 = vmull_s8(vget_low_s8(vb5), vget_low_s8(va3));
+        vprod0x5 = vmlal_s8(vprod0x5, vget_high_s8(vb5), vget_high_s8(va0));
+        vprod1x5 = vmlal_s8(vprod1x5, vget_high_s8(vb5), vget_high_s8(va1));
+        vprod2x5 = vmlal_s8(vprod2x5, vget_high_s8(vb5), vget_high_s8(va2));
+        vprod3x5 = vmlal_s8(vprod3x5, vget_high_s8(vb5), vget_high_s8(va3));
+        vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+        vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);
+        vacc2x5 = vpadalq_s16(vacc2x5, vprod2x5);
+        vacc3x5 = vpadalq_s16(vacc3x5, vprod3x5);
+        int16x8_t vprod0x6 = vmull_s8(vget_low_s8(vb6), vget_low_s8(va0));
+        int16x8_t vprod1x6 = vmull_s8(vget_low_s8(vb6), vget_low_s8(va1));
+        int16x8_t vprod2x6 = vmull_s8(vget_low_s8(vb6), vget_low_s8(va2));
+        int16x8_t vprod3x6 = vmull_s8(vget_low_s8(vb6), vget_low_s8(va3));
+        vprod0x6 = vmlal_s8(vprod0x6, vget_high_s8(vb6), vget_high_s8(va0));
+        vprod1x6 = vmlal_s8(vprod1x6, vget_high_s8(vb6), vget_high_s8(va1));
+        vprod2x6 = vmlal_s8(vprod2x6, vget_high_s8(vb6), vget_high_s8(va2));
+        vprod3x6 = vmlal_s8(vprod3x6, vget_high_s8(vb6), vget_high_s8(va3));
+        vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+        vacc1x6 = vpadalq_s16(vacc1x6, vprod1x6);
+        vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6);
+        vacc3x6 = vpadalq_s16(vacc3x6, vprod3x6);
+        int16x8_t vprod0x7 = vmull_s8(vget_low_s8(vb7), vget_low_s8(va0));
+        int16x8_t vprod1x7 = vmull_s8(vget_low_s8(vb7), vget_low_s8(va1));
+        int16x8_t vprod2x7 = vmull_s8(vget_low_s8(vb7), vget_low_s8(va2));
+        int16x8_t vprod3x7 = vmull_s8(vget_low_s8(vb7), vget_low_s8(va3));
+        vprod0x7 = vmlal_s8(vprod0x7, vget_high_s8(vb7), vget_high_s8(va0));
+        vprod1x7 = vmlal_s8(vprod1x7, vget_high_s8(vb7), vget_high_s8(va1));
+        vprod2x7 = vmlal_s8(vprod2x7, vget_high_s8(vb7), vget_high_s8(va2));
+        vprod3x7 = vmlal_s8(vprod3x7, vget_high_s8(vb7), vget_high_s8(va3));
+        vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+        vacc1x7 = vpadalq_s16(vacc1x7, vprod1x7);
+        vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7);
+        vacc3x7 = vpadalq_s16(vacc3x7, vprod3x7);
+        int16x8_t vprod0x8 = vmull_s8(vget_low_s8(vb8), vget_low_s8(va0));
+        int16x8_t vprod1x8 = vmull_s8(vget_low_s8(vb8), vget_low_s8(va1));
+        int16x8_t vprod2x8 = vmull_s8(vget_low_s8(vb8), vget_low_s8(va2));
+        int16x8_t vprod3x8 = vmull_s8(vget_low_s8(vb8), vget_low_s8(va3));
+        vprod0x8 = vmlal_s8(vprod0x8, vget_high_s8(vb8), vget_high_s8(va0));
+        vprod1x8 = vmlal_s8(vprod1x8, vget_high_s8(vb8), vget_high_s8(va1));
+        vprod2x8 = vmlal_s8(vprod2x8, vget_high_s8(vb8), vget_high_s8(va2));
+        vprod3x8 = vmlal_s8(vprod3x8, vget_high_s8(vb8), vget_high_s8(va3));
+        vacc0x8 = vpadalq_s16(vacc0x8, vprod0x8);
+        vacc1x8 = vpadalq_s16(vacc1x8, vprod1x8);
+        vacc2x8 = vpadalq_s16(vacc2x8, vprod2x8);
+        vacc3x8 = vpadalq_s16(vacc3x8, vprod3x8);
+        int16x8_t vprod0x9 = vmull_s8(vget_low_s8(vb9), vget_low_s8(va0));
+        int16x8_t vprod1x9 = vmull_s8(vget_low_s8(vb9), vget_low_s8(va1));
+        int16x8_t vprod2x9 = vmull_s8(vget_low_s8(vb9), vget_low_s8(va2));
+        int16x8_t vprod3x9 = vmull_s8(vget_low_s8(vb9), vget_low_s8(va3));
+        vprod0x9 = vmlal_s8(vprod0x9, vget_high_s8(vb9), vget_high_s8(va0));
+        vprod1x9 = vmlal_s8(vprod1x9, vget_high_s8(vb9), vget_high_s8(va1));
+        vprod2x9 = vmlal_s8(vprod2x9, vget_high_s8(vb9), vget_high_s8(va2));
+        vprod3x9 = vmlal_s8(vprod3x9, vget_high_s8(vb9), vget_high_s8(va3));
+        vacc0x9 = vpadalq_s16(vacc0x9, vprod0x9);
+        vacc1x9 = vpadalq_s16(vacc1x9, vprod1x9);
+        vacc2x9 = vpadalq_s16(vacc2x9, vprod2x9);
+        vacc3x9 = vpadalq_s16(vacc3x9, vprod3x9);
+        int16x8_t vprod0x10 = vmull_s8(vget_low_s8(vb10), vget_low_s8(va0));
+        int16x8_t vprod1x10 = vmull_s8(vget_low_s8(vb10), vget_low_s8(va1));
+        int16x8_t vprod2x10 = vmull_s8(vget_low_s8(vb10), vget_low_s8(va2));
+        int16x8_t vprod3x10 = vmull_s8(vget_low_s8(vb10), vget_low_s8(va3));
+        vprod0x10 = vmlal_s8(vprod0x10, vget_high_s8(vb10), vget_high_s8(va0));
+        vprod1x10 = vmlal_s8(vprod1x10, vget_high_s8(vb10), vget_high_s8(va1));
+        vprod2x10 = vmlal_s8(vprod2x10, vget_high_s8(vb10), vget_high_s8(va2));
+        vprod3x10 = vmlal_s8(vprod3x10, vget_high_s8(vb10), vget_high_s8(va3));
+        vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
+        vacc1x10 = vpadalq_s16(vacc1x10, vprod1x10);
+        vacc2x10 = vpadalq_s16(vacc2x10, vprod2x10);
+        vacc3x10 = vpadalq_s16(vacc3x10, vprod3x10);
+        int16x8_t vprod0x11 = vmull_s8(vget_low_s8(vb11), vget_low_s8(va0));
+        int16x8_t vprod1x11 = vmull_s8(vget_low_s8(vb11), vget_low_s8(va1));
+        int16x8_t vprod2x11 = vmull_s8(vget_low_s8(vb11), vget_low_s8(va2));
+        int16x8_t vprod3x11 = vmull_s8(vget_low_s8(vb11), vget_low_s8(va3));
+        vprod0x11 = vmlal_s8(vprod0x11, vget_high_s8(vb11), vget_high_s8(va0));
+        vprod1x11 = vmlal_s8(vprod1x11, vget_high_s8(vb11), vget_high_s8(va1));
+        vprod2x11 = vmlal_s8(vprod2x11, vget_high_s8(vb11), vget_high_s8(va2));
+        vprod3x11 = vmlal_s8(vprod3x11, vget_high_s8(vb11), vget_high_s8(va3));
+        vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11);
+        vacc1x11 = vpadalq_s16(vacc1x11, vprod1x11);
+        vacc2x11 = vpadalq_s16(vacc2x11, vprod2x11);
+        vacc3x11 = vpadalq_s16(vacc3x11, vprod3x11);
+        int16x8_t vprod0x12 = vmull_s8(vget_low_s8(vb12), vget_low_s8(va0));
+        int16x8_t vprod1x12 = vmull_s8(vget_low_s8(vb12), vget_low_s8(va1));
+        int16x8_t vprod2x12 = vmull_s8(vget_low_s8(vb12), vget_low_s8(va2));
+        int16x8_t vprod3x12 = vmull_s8(vget_low_s8(vb12), vget_low_s8(va3));
+        vprod0x12 = vmlal_s8(vprod0x12, vget_high_s8(vb12), vget_high_s8(va0));
+        vprod1x12 = vmlal_s8(vprod1x12, vget_high_s8(vb12), vget_high_s8(va1));
+        vprod2x12 = vmlal_s8(vprod2x12, vget_high_s8(vb12), vget_high_s8(va2));
+        vprod3x12 = vmlal_s8(vprod3x12, vget_high_s8(vb12), vget_high_s8(va3));
+        vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);
+        vacc1x12 = vpadalq_s16(vacc1x12, vprod1x12);
+        vacc2x12 = vpadalq_s16(vacc2x12, vprod2x12);
+        vacc3x12 = vpadalq_s16(vacc3x12, vprod3x12);
+        int16x8_t vprod0x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va0));
+        int16x8_t vprod1x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va1));
+        int16x8_t vprod2x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va2));
+        int16x8_t vprod3x13 = vmull_s8(vget_low_s8(vb13), vget_low_s8(va3));
+        vprod0x13 = vmlal_s8(vprod0x13, vget_high_s8(vb13), vget_high_s8(va0));
+        vprod1x13 = vmlal_s8(vprod1x13, vget_high_s8(vb13), vget_high_s8(va1));
+        vprod2x13 = vmlal_s8(vprod2x13, vget_high_s8(vb13), vget_high_s8(va2));
+        vprod3x13 = vmlal_s8(vprod3x13, vget_high_s8(vb13), vget_high_s8(va3));
+        vacc0x13 = vpadalq_s16(vacc0x13, vprod0x13);
+        vacc1x13 = vpadalq_s16(vacc1x13, vprod1x13);
+        vacc2x13 = vpadalq_s16(vacc2x13, vprod2x13);
+        vacc3x13 = vpadalq_s16(vacc3x13, vprod3x13);
+        int16x8_t vprod0x14 = vmull_s8(vget_low_s8(vb14), vget_low_s8(va0));
+        int16x8_t vprod1x14 = vmull_s8(vget_low_s8(vb14), vget_low_s8(va1));
+        int16x8_t vprod2x14 = vmull_s8(vget_low_s8(vb14), vget_low_s8(va2));
+        int16x8_t vprod3x14 = vmull_s8(vget_low_s8(vb14), vget_low_s8(va3));
+        vprod0x14 = vmlal_s8(vprod0x14, vget_high_s8(vb14), vget_high_s8(va0));
+        vprod1x14 = vmlal_s8(vprod1x14, vget_high_s8(vb14), vget_high_s8(va1));
+        vprod2x14 = vmlal_s8(vprod2x14, vget_high_s8(vb14), vget_high_s8(va2));
+        vprod3x14 = vmlal_s8(vprod3x14, vget_high_s8(vb14), vget_high_s8(va3));
+        vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
+        vacc1x14 = vpadalq_s16(vacc1x14, vprod1x14);
+        vacc2x14 = vpadalq_s16(vacc2x14, vprod2x14);
+        vacc3x14 = vpadalq_s16(vacc3x14, vprod3x14);
+        int16x8_t vprod0x15 = vmull_s8(vget_low_s8(vb15), vget_low_s8(va0));
+        int16x8_t vprod1x15 = vmull_s8(vget_low_s8(vb15), vget_low_s8(va1));
+        int16x8_t vprod2x15 = vmull_s8(vget_low_s8(vb15), vget_low_s8(va2));
+        int16x8_t vprod3x15 = vmull_s8(vget_low_s8(vb15), vget_low_s8(va3));
+        vprod0x15 = vmlal_s8(vprod0x15, vget_high_s8(vb15), vget_high_s8(va0));
+        vprod1x15 = vmlal_s8(vprod1x15, vget_high_s8(vb15), vget_high_s8(va1));
+        vprod2x15 = vmlal_s8(vprod2x15, vget_high_s8(vb15), vget_high_s8(va2));
+        vprod3x15 = vmlal_s8(vprod3x15, vget_high_s8(vb15), vget_high_s8(va3));
+        vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
+        vacc1x15 = vpadalq_s16(vacc1x15, vprod1x15);
+        vacc2x15 = vpadalq_s16(vacc2x15, vprod2x15);
+        vacc3x15 = vpadalq_s16(vacc3x15, vprod3x15);
+
+        k += 16 * sizeof(int8_t);
+      }
+
+      p -= 4 * sizeof(void*);
+    } while (p != 0);
+
+#if XNN_ARCH_ARM64
+    const int32x4_t vsum0x01 = vpaddq_s32(vacc0x0, vacc0x1);
+    const int32x4_t vsum0x23 = vpaddq_s32(vacc0x2, vacc0x3);
+    const int32x4_t vsum0x45 = vpaddq_s32(vacc0x4, vacc0x5);
+    const int32x4_t vsum0x67 = vpaddq_s32(vacc0x6, vacc0x7);
+    const int32x4_t vsum0x89 = vpaddq_s32(vacc0x8, vacc0x9);
+    const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11);
+    const int32x4_t vsum0xCD = vpaddq_s32(vacc0x12, vacc0x13);
+    const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
+    const int32x4_t vsum1x01 = vpaddq_s32(vacc1x0, vacc1x1);
+    const int32x4_t vsum1x23 = vpaddq_s32(vacc1x2, vacc1x3);
+    const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);
+    const int32x4_t vsum1x67 = vpaddq_s32(vacc1x6, vacc1x7);
+    const int32x4_t vsum1x89 = vpaddq_s32(vacc1x8, vacc1x9);
+    const int32x4_t vsum1xAB = vpaddq_s32(vacc1x10, vacc1x11);
+    const int32x4_t vsum1xCD = vpaddq_s32(vacc1x12, vacc1x13);
+    const int32x4_t vsum1xEF = vpaddq_s32(vacc1x14, vacc1x15);
+    const int32x4_t vsum2x01 = vpaddq_s32(vacc2x0, vacc2x1);
+    const int32x4_t vsum2x23 = vpaddq_s32(vacc2x2, vacc2x3);
+    const int32x4_t vsum2x45 = vpaddq_s32(vacc2x4, vacc2x5);
+    const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7);
+    const int32x4_t vsum2x89 = vpaddq_s32(vacc2x8, vacc2x9);
+    const int32x4_t vsum2xAB = vpaddq_s32(vacc2x10, vacc2x11);
+    const int32x4_t vsum2xCD = vpaddq_s32(vacc2x12, vacc2x13);
+    const int32x4_t vsum2xEF = vpaddq_s32(vacc2x14, vacc2x15);
+    const int32x4_t vsum3x01 = vpaddq_s32(vacc3x0, vacc3x1);
+    const int32x4_t vsum3x23 = vpaddq_s32(vacc3x2, vacc3x3);
+    const int32x4_t vsum3x45 = vpaddq_s32(vacc3x4, vacc3x5);
+    const int32x4_t vsum3x67 = vpaddq_s32(vacc3x6, vacc3x7);
+    const int32x4_t vsum3x89 = vpaddq_s32(vacc3x8, vacc3x9);
+    const int32x4_t vsum3xAB = vpaddq_s32(vacc3x10, vacc3x11);
+    const int32x4_t vsum3xCD = vpaddq_s32(vacc3x12, vacc3x13);
+    const int32x4_t vsum3xEF = vpaddq_s32(vacc3x14, vacc3x15);
+    int32x4_t vacc0x0123 = vpaddq_s32(vsum0x01, vsum0x23);
+    int32x4_t vacc0x4567 = vpaddq_s32(vsum0x45, vsum0x67);
+    int32x4_t vacc0x89AB = vpaddq_s32(vsum0x89, vsum0xAB);
+    int32x4_t vacc0xCDEF = vpaddq_s32(vsum0xCD, vsum0xEF);
+    int32x4_t vacc1x0123 = vpaddq_s32(vsum1x01, vsum1x23);
+    int32x4_t vacc1x4567 = vpaddq_s32(vsum1x45, vsum1x67);
+    int32x4_t vacc1x89AB = vpaddq_s32(vsum1x89, vsum1xAB);
+    int32x4_t vacc1xCDEF = vpaddq_s32(vsum1xCD, vsum1xEF);
+    int32x4_t vacc2x0123 = vpaddq_s32(vsum2x01, vsum2x23);
+    int32x4_t vacc2x4567 = vpaddq_s32(vsum2x45, vsum2x67);
+    int32x4_t vacc2x89AB = vpaddq_s32(vsum2x89, vsum2xAB);
+    int32x4_t vacc2xCDEF = vpaddq_s32(vsum2xCD, vsum2xEF);
+    int32x4_t vacc3x0123 = vpaddq_s32(vsum3x01, vsum3x23);
+    int32x4_t vacc3x4567 = vpaddq_s32(vsum3x45, vsum3x67);
+    int32x4_t vacc3x89AB = vpaddq_s32(vsum3x89, vsum3xAB);
+    int32x4_t vacc3xCDEF = vpaddq_s32(vsum3xCD, vsum3xEF);
+#else
+    const int32x2_t vpsum0x0 = vadd_s32(vget_low_s32(vacc0x0), vget_high_s32(vacc0x0));
+    const int32x2_t vpsum0x1 = vadd_s32(vget_low_s32(vacc0x1), vget_high_s32(vacc0x1));
+    const int32x2_t vpsum0x2 = vadd_s32(vget_low_s32(vacc0x2), vget_high_s32(vacc0x2));
+    const int32x2_t vpsum0x3 = vadd_s32(vget_low_s32(vacc0x3), vget_high_s32(vacc0x3));
+    const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1);
+    const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3);
+    int32x4_t vacc0x0123 = vcombine_s32(vsum0x01, vsum0x23 );
+    const int32x2_t vpsum0x4 = vadd_s32(vget_low_s32(vacc0x4), vget_high_s32(vacc0x4));
+    const int32x2_t vpsum0x5 = vadd_s32(vget_low_s32(vacc0x5), vget_high_s32(vacc0x5));
+    const int32x2_t vpsum0x6 = vadd_s32(vget_low_s32(vacc0x6), vget_high_s32(vacc0x6));
+    const int32x2_t vpsum0x7 = vadd_s32(vget_low_s32(vacc0x7), vget_high_s32(vacc0x7));
+    const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5);
+    const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7);
+    int32x4_t vacc0x4567 = vcombine_s32(vsum0x45, vsum0x67 );
+    const int32x2_t vpsum0x8 = vadd_s32(vget_low_s32(vacc0x8), vget_high_s32(vacc0x8));
+    const int32x2_t vpsum0x9 = vadd_s32(vget_low_s32(vacc0x9), vget_high_s32(vacc0x9));
+    const int32x2_t vpsum0xA = vadd_s32(vget_low_s32(vacc0x10), vget_high_s32(vacc0x10));
+    const int32x2_t vpsum0xB = vadd_s32(vget_low_s32(vacc0x11), vget_high_s32(vacc0x11));
+    const int32x2_t vsum0x89 = vpadd_s32(vpsum0x8, vpsum0x9);
+    const int32x2_t vsum0xAB = vpadd_s32(vpsum0xA, vpsum0xB);
+    int32x4_t vacc0x89AB = vcombine_s32(vsum0x89, vsum0xAB );
+    const int32x2_t vpsum0xC = vadd_s32(vget_low_s32(vacc0x12), vget_high_s32(vacc0x12));
+    const int32x2_t vpsum0xD = vadd_s32(vget_low_s32(vacc0x13), vget_high_s32(vacc0x13));
+    const int32x2_t vpsum0xE = vadd_s32(vget_low_s32(vacc0x14), vget_high_s32(vacc0x14));
+    const int32x2_t vpsum0xF = vadd_s32(vget_low_s32(vacc0x15), vget_high_s32(vacc0x15));
+    const int32x2_t vsum0xCD = vpadd_s32(vpsum0xC, vpsum0xD);
+    const int32x2_t vsum0xEF = vpadd_s32(vpsum0xE, vpsum0xF);
+    int32x4_t vacc0xCDEF = vcombine_s32(vsum0xCD, vsum0xEF );
+    const int32x2_t vpsum1x0 = vadd_s32(vget_low_s32(vacc1x0), vget_high_s32(vacc1x0));
+    const int32x2_t vpsum1x1 = vadd_s32(vget_low_s32(vacc1x1), vget_high_s32(vacc1x1));
+    const int32x2_t vpsum1x2 = vadd_s32(vget_low_s32(vacc1x2), vget_high_s32(vacc1x2));
+    const int32x2_t vpsum1x3 = vadd_s32(vget_low_s32(vacc1x3), vget_high_s32(vacc1x3));
+    const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1);
+    const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3);
+    int32x4_t vacc1x0123 = vcombine_s32(vsum1x01, vsum1x23 );
+    const int32x2_t vpsum1x4 = vadd_s32(vget_low_s32(vacc1x4), vget_high_s32(vacc1x4));
+    const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));
+    const int32x2_t vpsum1x6 = vadd_s32(vget_low_s32(vacc1x6), vget_high_s32(vacc1x6));
+    const int32x2_t vpsum1x7 = vadd_s32(vget_low_s32(vacc1x7), vget_high_s32(vacc1x7));
+    const int32x2_t vsum1x45 = vpadd_s32(vpsum1x4, vpsum1x5);
+    const int32x2_t vsum1x67 = vpadd_s32(vpsum1x6, vpsum1x7);
+    int32x4_t vacc1x4567 = vcombine_s32(vsum1x45, vsum1x67 );
+    const int32x2_t vpsum1x8 = vadd_s32(vget_low_s32(vacc1x8), vget_high_s32(vacc1x8));
+    const int32x2_t vpsum1x9 = vadd_s32(vget_low_s32(vacc1x9), vget_high_s32(vacc1x9));
+    const int32x2_t vpsum1xA = vadd_s32(vget_low_s32(vacc1x10), vget_high_s32(vacc1x10));
+    const int32x2_t vpsum1xB = vadd_s32(vget_low_s32(vacc1x11), vget_high_s32(vacc1x11));
+    const int32x2_t vsum1x89 = vpadd_s32(vpsum1x8, vpsum1x9);
+    const int32x2_t vsum1xAB = vpadd_s32(vpsum1xA, vpsum1xB);
+    int32x4_t vacc1x89AB = vcombine_s32(vsum1x89, vsum1xAB );
+    const int32x2_t vpsum1xC = vadd_s32(vget_low_s32(vacc1x12), vget_high_s32(vacc1x12));
+    const int32x2_t vpsum1xD = vadd_s32(vget_low_s32(vacc1x13), vget_high_s32(vacc1x13));
+    const int32x2_t vpsum1xE = vadd_s32(vget_low_s32(vacc1x14), vget_high_s32(vacc1x14));
+    const int32x2_t vpsum1xF = vadd_s32(vget_low_s32(vacc1x15), vget_high_s32(vacc1x15));
+    const int32x2_t vsum1xCD = vpadd_s32(vpsum1xC, vpsum1xD);
+    const int32x2_t vsum1xEF = vpadd_s32(vpsum1xE, vpsum1xF);
+    int32x4_t vacc1xCDEF = vcombine_s32(vsum1xCD, vsum1xEF );
+    const int32x2_t vpsum2x0 = vadd_s32(vget_low_s32(vacc2x0), vget_high_s32(vacc2x0));
+    const int32x2_t vpsum2x1 = vadd_s32(vget_low_s32(vacc2x1), vget_high_s32(vacc2x1));
+    const int32x2_t vpsum2x2 = vadd_s32(vget_low_s32(vacc2x2), vget_high_s32(vacc2x2));
+    const int32x2_t vpsum2x3 = vadd_s32(vget_low_s32(vacc2x3), vget_high_s32(vacc2x3));
+    const int32x2_t vsum2x01 = vpadd_s32(vpsum2x0, vpsum2x1);
+    const int32x2_t vsum2x23 = vpadd_s32(vpsum2x2, vpsum2x3);
+    int32x4_t vacc2x0123 = vcombine_s32(vsum2x01, vsum2x23 );
+    const int32x2_t vpsum2x4 = vadd_s32(vget_low_s32(vacc2x4), vget_high_s32(vacc2x4));
+    const int32x2_t vpsum2x5 = vadd_s32(vget_low_s32(vacc2x5), vget_high_s32(vacc2x5));
+    const int32x2_t vpsum2x6 = vadd_s32(vget_low_s32(vacc2x6), vget_high_s32(vacc2x6));
+    const int32x2_t vpsum2x7 = vadd_s32(vget_low_s32(vacc2x7), vget_high_s32(vacc2x7));
+    const int32x2_t vsum2x45 = vpadd_s32(vpsum2x4, vpsum2x5);
+    const int32x2_t vsum2x67 = vpadd_s32(vpsum2x6, vpsum2x7);
+    int32x4_t vacc2x4567 = vcombine_s32(vsum2x45, vsum2x67 );
+    const int32x2_t vpsum2x8 = vadd_s32(vget_low_s32(vacc2x8), vget_high_s32(vacc2x8));
+    const int32x2_t vpsum2x9 = vadd_s32(vget_low_s32(vacc2x9), vget_high_s32(vacc2x9));
+    const int32x2_t vpsum2xA = vadd_s32(vget_low_s32(vacc2x10), vget_high_s32(vacc2x10));
+    const int32x2_t vpsum2xB = vadd_s32(vget_low_s32(vacc2x11), vget_high_s32(vacc2x11));
+    const int32x2_t vsum2x89 = vpadd_s32(vpsum2x8, vpsum2x9);
+    const int32x2_t vsum2xAB = vpadd_s32(vpsum2xA, vpsum2xB);
+    int32x4_t vacc2x89AB = vcombine_s32(vsum2x89, vsum2xAB );
+    const int32x2_t vpsum2xC = vadd_s32(vget_low_s32(vacc2x12), vget_high_s32(vacc2x12));
+    const int32x2_t vpsum2xD = vadd_s32(vget_low_s32(vacc2x13), vget_high_s32(vacc2x13));
+    const int32x2_t vpsum2xE = vadd_s32(vget_low_s32(vacc2x14), vget_high_s32(vacc2x14));
+    const int32x2_t vpsum2xF = vadd_s32(vget_low_s32(vacc2x15), vget_high_s32(vacc2x15));
+    const int32x2_t vsum2xCD = vpadd_s32(vpsum2xC, vpsum2xD);
+    const int32x2_t vsum2xEF = vpadd_s32(vpsum2xE, vpsum2xF);
+    int32x4_t vacc2xCDEF = vcombine_s32(vsum2xCD, vsum2xEF );
+    const int32x2_t vpsum3x0 = vadd_s32(vget_low_s32(vacc3x0), vget_high_s32(vacc3x0));
+    const int32x2_t vpsum3x1 = vadd_s32(vget_low_s32(vacc3x1), vget_high_s32(vacc3x1));
+    const int32x2_t vpsum3x2 = vadd_s32(vget_low_s32(vacc3x2), vget_high_s32(vacc3x2));
+    const int32x2_t vpsum3x3 = vadd_s32(vget_low_s32(vacc3x3), vget_high_s32(vacc3x3));
+    const int32x2_t vsum3x01 = vpadd_s32(vpsum3x0, vpsum3x1);
+    const int32x2_t vsum3x23 = vpadd_s32(vpsum3x2, vpsum3x3);
+    int32x4_t vacc3x0123 = vcombine_s32(vsum3x01, vsum3x23 );
+    const int32x2_t vpsum3x4 = vadd_s32(vget_low_s32(vacc3x4), vget_high_s32(vacc3x4));
+    const int32x2_t vpsum3x5 = vadd_s32(vget_low_s32(vacc3x5), vget_high_s32(vacc3x5));
+    const int32x2_t vpsum3x6 = vadd_s32(vget_low_s32(vacc3x6), vget_high_s32(vacc3x6));
+    const int32x2_t vpsum3x7 = vadd_s32(vget_low_s32(vacc3x7), vget_high_s32(vacc3x7));
+    const int32x2_t vsum3x45 = vpadd_s32(vpsum3x4, vpsum3x5);
+    const int32x2_t vsum3x67 = vpadd_s32(vpsum3x6, vpsum3x7);
+    int32x4_t vacc3x4567 = vcombine_s32(vsum3x45, vsum3x67 );
+    const int32x2_t vpsum3x8 = vadd_s32(vget_low_s32(vacc3x8), vget_high_s32(vacc3x8));
+    const int32x2_t vpsum3x9 = vadd_s32(vget_low_s32(vacc3x9), vget_high_s32(vacc3x9));
+    const int32x2_t vpsum3xA = vadd_s32(vget_low_s32(vacc3x10), vget_high_s32(vacc3x10));
+    const int32x2_t vpsum3xB = vadd_s32(vget_low_s32(vacc3x11), vget_high_s32(vacc3x11));
+    const int32x2_t vsum3x89 = vpadd_s32(vpsum3x8, vpsum3x9);
+    const int32x2_t vsum3xAB = vpadd_s32(vpsum3xA, vpsum3xB);
+    int32x4_t vacc3x89AB = vcombine_s32(vsum3x89, vsum3xAB );
+    const int32x2_t vpsum3xC = vadd_s32(vget_low_s32(vacc3x12), vget_high_s32(vacc3x12));
+    const int32x2_t vpsum3xD = vadd_s32(vget_low_s32(vacc3x13), vget_high_s32(vacc3x13));
+    const int32x2_t vpsum3xE = vadd_s32(vget_low_s32(vacc3x14), vget_high_s32(vacc3x14));
+    const int32x2_t vpsum3xF = vadd_s32(vget_low_s32(vacc3x15), vget_high_s32(vacc3x15));
+    const int32x2_t vsum3xCD = vpadd_s32(vpsum3xC, vpsum3xD);
+    const int32x2_t vsum3xEF = vpadd_s32(vpsum3xE, vpsum3xF);
+    int32x4_t vacc3xCDEF = vcombine_s32(vsum3xCD, vsum3xEF );
+#endif
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+    vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc1x89AB = vqrdmulhq_s32(vacc1x89AB, vmultiplier);
+    vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier);
+    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+    vacc2x89AB = vqrdmulhq_s32(vacc2x89AB, vmultiplier);
+    vacc2xCDEF = vqrdmulhq_s32(vacc2xCDEF, vmultiplier);
+    vacc3x0123 = vqrdmulhq_s32(vacc3x0123, vmultiplier);
+    vacc3x4567 = vqrdmulhq_s32(vacc3x4567, vmultiplier);
+    vacc3x89AB = vqrdmulhq_s32(vacc3x89AB, vmultiplier);
+    vacc3xCDEF = vqrdmulhq_s32(vacc3xCDEF, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+    vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc1x89AB = vsraq_n_s32(vacc1x89AB, vbicq_s32(vacc1x89AB, vzero_shift_mask), 31);
+    vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31);
+    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+    vacc2x89AB = vsraq_n_s32(vacc2x89AB, vbicq_s32(vacc2x89AB, vzero_shift_mask), 31);
+    vacc2xCDEF = vsraq_n_s32(vacc2xCDEF, vbicq_s32(vacc2xCDEF, vzero_shift_mask), 31);
+    vacc3x0123 = vsraq_n_s32(vacc3x0123, vbicq_s32(vacc3x0123, vzero_shift_mask), 31);
+    vacc3x4567 = vsraq_n_s32(vacc3x4567, vbicq_s32(vacc3x4567, vzero_shift_mask), 31);
+    vacc3x89AB = vsraq_n_s32(vacc3x89AB, vbicq_s32(vacc3x89AB, vzero_shift_mask), 31);
+    vacc3xCDEF = vsraq_n_s32(vacc3xCDEF, vbicq_s32(vacc3xCDEF, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc1x89AB = vrshlq_s32(vacc1x89AB, vright_shift);
+    vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_shift);
+    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+    vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_shift);
+    vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_shift);
+    vacc3x0123 = vrshlq_s32(vacc3x0123, vright_shift);
+    vacc3x4567 = vrshlq_s32(vacc3x4567, vright_shift);
+    vacc3x89AB = vrshlq_s32(vacc3x89AB, vright_shift);
+    vacc3xCDEF = vrshlq_s32(vacc3xCDEF, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+    const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), voutput_zero_point);
+    const int16x8_t vacc3x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567), voutput_zero_point);
+    const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), voutput_zero_point);
+    int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+    int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
+    int8x16_t vout2x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc2x89ABCDEF);
+    int8x16_t vout3x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc3x01234567), vacc3x89ABCDEF);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+    const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)), voutput_zero_point);
+    const int16x8_t vacc3x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x0123), vqmovn_s32(vacc3x4567)), voutput_zero_point);
+    const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3xCDEF)), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+    int8x16_t vout1x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc1x01234567), vqmovn_s16(vacc1x89ABCDEF));
+    int8x16_t vout2x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc2x89ABCDEF));
+    int8x16_t vout3x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc3x01234567), vqmovn_s16(vacc3x89ABCDEF));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout3x0123456789ABCDEF = vmaxq_s8(vout3x0123456789ABCDEF, voutput_min);
+    vout2x0123456789ABCDEF = vmaxq_s8(vout2x0123456789ABCDEF, voutput_min);
+    vout1x0123456789ABCDEF = vmaxq_s8(vout1x0123456789ABCDEF, voutput_min);
+    vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+
+    vout3x0123456789ABCDEF = vminq_s8(vout3x0123456789ABCDEF, voutput_max);
+    vout2x0123456789ABCDEF = vminq_s8(vout2x0123456789ABCDEF, voutput_max);
+    vout1x0123456789ABCDEF = vminq_s8(vout1x0123456789ABCDEF, voutput_max);
+    vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+
+    if (nc >= 16) {
+      vst1q_s8(c3 + 0, vout3x0123456789ABCDEF);
+      vst1q_s8(c2 + 0, vout2x0123456789ABCDEF);
+      vst1q_s8(c1 + 0, vout1x0123456789ABCDEF);
+      vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+
+      c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 16;
+    } else {
+      int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8(vout1x0123456789ABCDEF));
+      int8x16_t vout2x01234567_3x01234567 = vcombine_s8(vget_low_s8(vout2x0123456789ABCDEF), vget_low_s8(vout3x0123456789ABCDEF));
+      if (nc & 8) {
+        vst1_s8(c3, vget_high_s8(vout2x01234567_3x01234567)); c3 += 8;
+        vst1_s8(c2, vget_low_s8(vout2x01234567_3x01234567)); c2 += 8;
+        vst1_s8(c1, vget_high_s8(vout0x01234567_1x01234567)); c1 += 8;
+        vst1_s8(c0, vget_low_s8(vout0x01234567_1x01234567)); c0 += 8;
+        vout2x01234567_3x01234567 = vcombine_s8(vget_high_s8(vout2x0123456789ABCDEF), vget_high_s8(vout3x0123456789ABCDEF));
+        vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0123456789ABCDEF));
+      }
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c3, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 2); c3 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 0); c2 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c3, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 4); c3 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 0); c2 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c3, vout2x01234567_3x01234567, 8);
+        vst1q_lane_s8(c2, vout2x01234567_3x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/4x16c2-minmax-neon-mlal-padal-dup.c b/src/qs8-igemm/gen/4x16c2-minmax-neon-mlal-padal-dup.c
new file mode 100644
index 0000000..ef26e6a
--- /dev/null
+++ b/src/qs8-igemm/gen/4x16c2-minmax-neon-mlal-padal-dup.c
@@ -0,0 +1,767 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/c2-neon-mull-padal-dup.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (4 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+  int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    c3 = c2;
+  }
+
+  do {
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc1x0123 = vacc0x0123;
+    int32x4_t vacc1x4567 = vacc0x4567;
+    int32x4_t vacc1x89AB = vacc0x89AB;
+    int32x4_t vacc1xCDEF = vacc0xCDEF;
+    int32x4_t vacc2x0123 = vacc0x0123;
+    int32x4_t vacc2x4567 = vacc0x4567;
+    int32x4_t vacc2x89AB = vacc0x89AB;
+    int32x4_t vacc2xCDEF = vacc0xCDEF;
+    int32x4_t vacc3x0123 = vacc0x0123;
+    int32x4_t vacc3x4567 = vacc0x4567;
+    int32x4_t vacc3x89AB = vacc0x89AB;
+    int32x4_t vacc3xCDEF = vacc0xCDEF;
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      const int8_t* restrict a2 = a[2];
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+      }
+      const int8_t* restrict a3 = a[3];
+      if XNN_UNPREDICTABLE(a3 != zero) {
+        a3 = (const int8_t*) ((uintptr_t) a3 + a_offset);
+      }
+      a += 4;
+
+      size_t k = kc;
+
+      while (k >= 16 * sizeof(int8_t)) {
+        const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va0x1 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va1x0 = vld1_s8(a1); a1 += 8;
+        const int8x8_t va1x1 = vld1_s8(a1); a1 += 8;
+        const int8x8_t va2x0 = vld1_s8(a2); a2 += 8;
+        const int8x8_t va2x1 = vld1_s8(a2); a2 += 8;
+        const int8x8_t va3x0 = vld1_s8(a3); a3 += 8;
+        const int8x8_t va3x1 = vld1_s8(a3); a3 += 8;
+
+        const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+        int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0)));
+        int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
+        int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 0)));
+        const int8x8_t vb0123c0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x0123c0 = vmlal_s8(vprod0x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+        vprod1x0123c0 = vmlal_s8(vprod1x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
+        vprod2x0123c0 = vmlal_s8(vprod2x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 0)));
+        vprod3x0123c0 = vmlal_s8(vprod3x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 0)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+        vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c0);
+        int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+        int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0)));
+        int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
+        int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 0)));
+        const int8x8_t vb4567c0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x4567c0 = vmlal_s8(vprod0x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+        vprod1x4567c0 = vmlal_s8(vprod1x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
+        vprod2x4567c0 = vmlal_s8(vprod2x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 0)));
+        vprod3x4567c0 = vmlal_s8(vprod3x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 0)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+        vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c0);
+        int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+        int16x8_t vprod1x89ABc0 = vmull_s8(vb89ABc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0)));
+        int16x8_t vprod2x89ABc0 = vmull_s8(vb89ABc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
+        int16x8_t vprod3x89ABc0 = vmull_s8(vb89ABc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 0)));
+        const int8x8_t vb89ABc0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x89ABc0 = vmlal_s8(vprod0x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+        vprod1x89ABc0 = vmlal_s8(vprod1x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
+        vprod2x89ABc0 = vmlal_s8(vprod2x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 0)));
+        vprod3x89ABc0 = vmlal_s8(vprod3x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 0)));
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+        vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc0);
+        vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
+        vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc0);
+        int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+        int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0)));
+        int16x8_t vprod2xCDEFc0 = vmull_s8(vbCDEFc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
+        int16x8_t vprod3xCDEFc0 = vmull_s8(vbCDEFc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 0)));
+        const int8x8_t vbCDEFc0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0xCDEFc0 = vmlal_s8(vprod0xCDEFc0, vbCDEFc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+        vprod1xCDEFc0 = vmlal_s8(vprod1xCDEFc0, vbCDEFc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
+        vprod2xCDEFc0 = vmlal_s8(vprod2xCDEFc0, vbCDEFc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 0)));
+        vprod3xCDEFc0 = vmlal_s8(vprod3xCDEFc0, vbCDEFc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 0)));
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+        vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0);
+        vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0);
+        vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc0);
+        int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+        int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 1)));
+        int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
+        int16x8_t vprod3x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 1)));
+        const int8x8_t vb0123c1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x0123c1 = vmlal_s8(vprod0x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+        vprod1x0123c1 = vmlal_s8(vprod1x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
+        vprod2x0123c1 = vmlal_s8(vprod2x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 1)));
+        vprod3x0123c1 = vmlal_s8(vprod3x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 1)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+        vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c1);
+        int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+        int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 1)));
+        int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
+        int16x8_t vprod3x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 1)));
+        const int8x8_t vb4567c1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x4567c1 = vmlal_s8(vprod0x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+        vprod1x4567c1 = vmlal_s8(vprod1x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
+        vprod2x4567c1 = vmlal_s8(vprod2x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 1)));
+        vprod3x4567c1 = vmlal_s8(vprod3x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 1)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+        vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c1);
+        int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+        int16x8_t vprod1x89ABc1 = vmull_s8(vb89ABc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 1)));
+        int16x8_t vprod2x89ABc1 = vmull_s8(vb89ABc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
+        int16x8_t vprod3x89ABc1 = vmull_s8(vb89ABc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 1)));
+        const int8x8_t vb89ABc1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x89ABc1 = vmlal_s8(vprod0x89ABc1, vb89ABc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+        vprod1x89ABc1 = vmlal_s8(vprod1x89ABc1, vb89ABc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
+        vprod2x89ABc1 = vmlal_s8(vprod2x89ABc1, vb89ABc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 1)));
+        vprod3x89ABc1 = vmlal_s8(vprod3x89ABc1, vb89ABc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 1)));
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+        vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc1);
+        vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1);
+        vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc1);
+        int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+        int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 1)));
+        int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
+        int16x8_t vprod3xCDEFc1 = vmull_s8(vbCDEFc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 1)));
+        const int8x8_t vbCDEFc1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0xCDEFc1 = vmlal_s8(vprod0xCDEFc1, vbCDEFc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+        vprod1xCDEFc1 = vmlal_s8(vprod1xCDEFc1, vbCDEFc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
+        vprod2xCDEFc1 = vmlal_s8(vprod2xCDEFc1, vbCDEFc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 1)));
+        vprod3xCDEFc1 = vmlal_s8(vprod3xCDEFc1, vbCDEFc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 1)));
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+        vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1);
+        vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1);
+        vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc1);
+        int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+        int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 2)));
+        int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 2)));
+        int16x8_t vprod3x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 2)));
+        const int8x8_t vb0123c2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x0123c2 = vmlal_s8(vprod0x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+        vprod1x0123c2 = vmlal_s8(vprod1x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
+        vprod2x0123c2 = vmlal_s8(vprod2x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 2)));
+        vprod3x0123c2 = vmlal_s8(vprod3x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 2)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+        vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c2);
+        int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+        int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 2)));
+        int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 2)));
+        int16x8_t vprod3x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 2)));
+        const int8x8_t vb4567c2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x4567c2 = vmlal_s8(vprod0x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+        vprod1x4567c2 = vmlal_s8(vprod1x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
+        vprod2x4567c2 = vmlal_s8(vprod2x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 2)));
+        vprod3x4567c2 = vmlal_s8(vprod3x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 2)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+        vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c2);
+        int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+        int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 2)));
+        int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 2)));
+        int16x8_t vprod3x89ABc2 = vmull_s8(vb89ABc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 2)));
+        const int8x8_t vb89ABc2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x89ABc2 = vmlal_s8(vprod0x89ABc2, vb89ABc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+        vprod1x89ABc2 = vmlal_s8(vprod1x89ABc2, vb89ABc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
+        vprod2x89ABc2 = vmlal_s8(vprod2x89ABc2, vb89ABc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 2)));
+        vprod3x89ABc2 = vmlal_s8(vprod3x89ABc2, vb89ABc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 2)));
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+        vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
+        vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
+        vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc2);
+        int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+        int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 2)));
+        int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 2)));
+        int16x8_t vprod3xCDEFc2 = vmull_s8(vbCDEFc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 2)));
+        const int8x8_t vbCDEFc2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0xCDEFc2 = vmlal_s8(vprod0xCDEFc2, vbCDEFc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+        vprod1xCDEFc2 = vmlal_s8(vprod1xCDEFc2, vbCDEFc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
+        vprod2xCDEFc2 = vmlal_s8(vprod2xCDEFc2, vbCDEFc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 2)));
+        vprod3xCDEFc2 = vmlal_s8(vprod3xCDEFc2, vbCDEFc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 2)));
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+        vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
+        vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2);
+        vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc2);
+        int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+        int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 3)));
+        int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 3)));
+        int16x8_t vprod3x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 3)));
+        const int8x8_t vb0123c3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x0123c3 = vmlal_s8(vprod0x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+        vprod1x0123c3 = vmlal_s8(vprod1x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
+        vprod2x0123c3 = vmlal_s8(vprod2x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 3)));
+        vprod3x0123c3 = vmlal_s8(vprod3x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 3)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
+        vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c3);
+        int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+        int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 3)));
+        int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 3)));
+        int16x8_t vprod3x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 3)));
+        const int8x8_t vb4567c3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x4567c3 = vmlal_s8(vprod0x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+        vprod1x4567c3 = vmlal_s8(vprod1x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
+        vprod2x4567c3 = vmlal_s8(vprod2x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 3)));
+        vprod3x4567c3 = vmlal_s8(vprod3x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 3)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
+        vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c3);
+        int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+        int16x8_t vprod1x89ABc3 = vmull_s8(vb89ABc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 3)));
+        int16x8_t vprod2x89ABc3 = vmull_s8(vb89ABc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 3)));
+        int16x8_t vprod3x89ABc3 = vmull_s8(vb89ABc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 3)));
+        const int8x8_t vb89ABc3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x89ABc3 = vmlal_s8(vprod0x89ABc3, vb89ABc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+        vprod1x89ABc3 = vmlal_s8(vprod1x89ABc3, vb89ABc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
+        vprod2x89ABc3 = vmlal_s8(vprod2x89ABc3, vb89ABc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 3)));
+        vprod3x89ABc3 = vmlal_s8(vprod3x89ABc3, vb89ABc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 3)));
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
+        vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc3);
+        vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3);
+        vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc3);
+        int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+        int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 3)));
+        int16x8_t vprod2xCDEFc3 = vmull_s8(vbCDEFc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 3)));
+        int16x8_t vprod3xCDEFc3 = vmull_s8(vbCDEFc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 3)));
+        const int8x8_t vbCDEFc3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0xCDEFc3 = vmlal_s8(vprod0xCDEFc3, vbCDEFc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+        vprod1xCDEFc3 = vmlal_s8(vprod1xCDEFc3, vbCDEFc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
+        vprod2xCDEFc3 = vmlal_s8(vprod2xCDEFc3, vbCDEFc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 3)));
+        vprod3xCDEFc3 = vmlal_s8(vprod3xCDEFc3, vbCDEFc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 3)));
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
+        vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3);
+        vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3);
+        vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc3);
+
+        k -= 16 * sizeof(int8_t);
+      }
+
+      if (k >= 8 * sizeof(int8_t)) {
+        const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+        const int8x8_t va2 = vld1_s8(a2); a2 += 8;
+        const int8x8_t va3 = vld1_s8(a3); a3 += 8;
+
+        const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+        const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+        const int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
+        const int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
+        const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+        const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+        const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+        const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+        const int16x8_t vprod1x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        const int16x8_t vprod1x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+        const int16x8_t vprod1x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+        vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc0);
+        vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc1);
+        vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
+        vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc3);
+        const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+        const int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+        vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0);
+        vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1);
+        vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
+        vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3);
+        const int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+        const int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+        const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+        const int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
+        const int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+        const int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+        const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+        const int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
+        const int16x8_t vprod2x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+        const int16x8_t vprod2x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+        const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+        const int16x8_t vprod2x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+        vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
+        vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1);
+        vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
+        vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3);
+        const int16x8_t vprod2xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+        const int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+        const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+        const int16x8_t vprod2xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+        vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0);
+        vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1);
+        vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2);
+        vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3);
+        const int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+        const int16x8_t vprod3x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+        const int16x8_t vprod3x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+        const int16x8_t vprod3x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
+        vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c0);
+        vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c1);
+        vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c2);
+        vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c3);
+        const int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+        const int16x8_t vprod3x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+        const int16x8_t vprod3x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+        const int16x8_t vprod3x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
+        vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c0);
+        vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c1);
+        vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c2);
+        vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c3);
+        const int16x8_t vprod3x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+        const int16x8_t vprod3x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+        const int16x8_t vprod3x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+        const int16x8_t vprod3x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
+        vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc0);
+        vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc1);
+        vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc2);
+        vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc3);
+        const int16x8_t vprod3xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+        const int16x8_t vprod3xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+        const int16x8_t vprod3xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+        const int16x8_t vprod3xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
+        vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc0);
+        vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc1);
+        vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc2);
+        vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc3);
+
+        k -= 8 * sizeof(int8_t);
+      }
+
+      if XNN_UNLIKELY(k != 0) {
+        const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+        const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
+        const int8x8_t va2 = vld1_s8(a2); a2 = (const int8_t*) ((uintptr_t) a2 + k);
+        const int8x8_t va3 = vld1_s8(a3); a3 = (const int8_t*) ((uintptr_t) a3 + k);
+
+        const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+        const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+        const int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+        const int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+        const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+        const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+        const int16x8_t vprod1x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc0);
+        const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0);
+        const int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+        const int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+        const int16x8_t vprod2x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+        vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
+        const int16x8_t vprod2xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+        vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0);
+        const int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+        vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c0);
+        const int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+        vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c0);
+        const int16x8_t vprod3x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+        vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc0);
+        const int16x8_t vprod3xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+        vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc0);
+
+        if (k > 2 * sizeof(int8_t)) {
+          const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb89ABc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vbCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+          const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+          const int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+          const int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+          const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+          vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+          const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+          vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+          const int16x8_t vprod1x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+          vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc1);
+          const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+          vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1);
+          const int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+          vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+          const int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+          vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+          const int16x8_t vprod2x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+          vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1);
+          const int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+          vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1);
+          const int16x8_t vprod3x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+          vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c1);
+          const int16x8_t vprod3x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+          vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c1);
+          const int16x8_t vprod3x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+          vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc1);
+          const int16x8_t vprod3xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+          vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc1);
+
+          if (k > 4 * sizeof(int8_t)) {
+            const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int8x8_t vb89ABc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int8x8_t vbCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+            const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+            const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+            const int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+            const int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+            const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+            vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+            const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+            vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+            const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+            vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
+            const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+            vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
+            const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+            vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+            const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+            vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+            const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+            vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
+            const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+            vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2);
+            const int16x8_t vprod3x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+            vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c2);
+            const int16x8_t vprod3x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+            vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c2);
+            const int16x8_t vprod3x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+            vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc2);
+            const int16x8_t vprod3xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+            vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc2);
+          }
+        }
+      }
+      p -= 4 * sizeof(void*);
+    } while (p != 0);
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+    vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc1x89AB = vqrdmulhq_s32(vacc1x89AB, vmultiplier);
+    vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier);
+    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+    vacc2x89AB = vqrdmulhq_s32(vacc2x89AB, vmultiplier);
+    vacc2xCDEF = vqrdmulhq_s32(vacc2xCDEF, vmultiplier);
+    vacc3x0123 = vqrdmulhq_s32(vacc3x0123, vmultiplier);
+    vacc3x4567 = vqrdmulhq_s32(vacc3x4567, vmultiplier);
+    vacc3x89AB = vqrdmulhq_s32(vacc3x89AB, vmultiplier);
+    vacc3xCDEF = vqrdmulhq_s32(vacc3xCDEF, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+    vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc1x89AB = vsraq_n_s32(vacc1x89AB, vbicq_s32(vacc1x89AB, vzero_shift_mask), 31);
+    vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31);
+    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+    vacc2x89AB = vsraq_n_s32(vacc2x89AB, vbicq_s32(vacc2x89AB, vzero_shift_mask), 31);
+    vacc2xCDEF = vsraq_n_s32(vacc2xCDEF, vbicq_s32(vacc2xCDEF, vzero_shift_mask), 31);
+    vacc3x0123 = vsraq_n_s32(vacc3x0123, vbicq_s32(vacc3x0123, vzero_shift_mask), 31);
+    vacc3x4567 = vsraq_n_s32(vacc3x4567, vbicq_s32(vacc3x4567, vzero_shift_mask), 31);
+    vacc3x89AB = vsraq_n_s32(vacc3x89AB, vbicq_s32(vacc3x89AB, vzero_shift_mask), 31);
+    vacc3xCDEF = vsraq_n_s32(vacc3xCDEF, vbicq_s32(vacc3xCDEF, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc1x89AB = vrshlq_s32(vacc1x89AB, vright_shift);
+    vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_shift);
+    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+    vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_shift);
+    vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_shift);
+    vacc3x0123 = vrshlq_s32(vacc3x0123, vright_shift);
+    vacc3x4567 = vrshlq_s32(vacc3x4567, vright_shift);
+    vacc3x89AB = vrshlq_s32(vacc3x89AB, vright_shift);
+    vacc3xCDEF = vrshlq_s32(vacc3xCDEF, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+    const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), voutput_zero_point);
+    const int16x8_t vacc3x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567), voutput_zero_point);
+    const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+    int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
+    int8x16_t vout2x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc2x89ABCDEF);
+    int8x16_t vout3x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc3x01234567), vacc3x89ABCDEF);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+    const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)), voutput_zero_point);
+    const int16x8_t vacc3x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x0123), vqmovn_s32(vacc3x4567)), voutput_zero_point);
+    const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3xCDEF)), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+    int8x16_t vout1x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc1x01234567), vqmovn_s16(vacc1x89ABCDEF));
+    int8x16_t vout2x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc2x89ABCDEF));
+    int8x16_t vout3x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc3x01234567), vqmovn_s16(vacc3x89ABCDEF));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout3x0123456789ABCDEF = vmaxq_s8(vout3x0123456789ABCDEF, voutput_min);
+    vout2x0123456789ABCDEF = vmaxq_s8(vout2x0123456789ABCDEF, voutput_min);
+    vout1x0123456789ABCDEF = vmaxq_s8(vout1x0123456789ABCDEF, voutput_min);
+    vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+
+    vout3x0123456789ABCDEF = vminq_s8(vout3x0123456789ABCDEF, voutput_max);
+    vout2x0123456789ABCDEF = vminq_s8(vout2x0123456789ABCDEF, voutput_max);
+    vout1x0123456789ABCDEF = vminq_s8(vout1x0123456789ABCDEF, voutput_max);
+    vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+
+    if (nc >= 16) {
+      vst1q_s8(c3 + 0, vout3x0123456789ABCDEF);
+      vst1q_s8(c2 + 0, vout2x0123456789ABCDEF);
+      vst1q_s8(c1 + 0, vout1x0123456789ABCDEF);
+      vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+
+      c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 16;
+    } else {
+      int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8(vout1x0123456789ABCDEF));
+      int8x16_t vout2x01234567_3x01234567 = vcombine_s8(vget_low_s8(vout2x0123456789ABCDEF), vget_low_s8(vout3x0123456789ABCDEF));
+      if (nc & 8) {
+        vst1_s8(c3, vget_high_s8(vout2x01234567_3x01234567)); c3 += 8;
+        vst1_s8(c2, vget_low_s8(vout2x01234567_3x01234567)); c2 += 8;
+        vst1_s8(c1, vget_high_s8(vout0x01234567_1x01234567)); c1 += 8;
+        vst1_s8(c0, vget_low_s8(vout0x01234567_1x01234567)); c0 += 8;
+        vout2x01234567_3x01234567 = vcombine_s8(vget_high_s8(vout2x0123456789ABCDEF), vget_high_s8(vout3x0123456789ABCDEF));
+        vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0123456789ABCDEF));
+      }
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c3, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 2); c3 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 0); c2 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c3, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 4); c3 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 0); c2 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c3, vout2x01234567_3x01234567, 8);
+        vst1q_lane_s8(c2, vout2x01234567_3x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/4x16c2-minmax-neon-mull-padal-dup.c b/src/qs8-igemm/gen/4x16c2-minmax-neon-mull-padal-dup.c
new file mode 100644
index 0000000..6b8b47e
--- /dev/null
+++ b/src/qs8-igemm/gen/4x16c2-minmax-neon-mull-padal-dup.c
@@ -0,0 +1,529 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/c2-neon-mull-padal-dup.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (4 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+  int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    c3 = c2;
+  }
+
+  do {
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc1x0123 = vacc0x0123;
+    int32x4_t vacc1x4567 = vacc0x4567;
+    int32x4_t vacc1x89AB = vacc0x89AB;
+    int32x4_t vacc1xCDEF = vacc0xCDEF;
+    int32x4_t vacc2x0123 = vacc0x0123;
+    int32x4_t vacc2x4567 = vacc0x4567;
+    int32x4_t vacc2x89AB = vacc0x89AB;
+    int32x4_t vacc2xCDEF = vacc0xCDEF;
+    int32x4_t vacc3x0123 = vacc0x0123;
+    int32x4_t vacc3x4567 = vacc0x4567;
+    int32x4_t vacc3x89AB = vacc0x89AB;
+    int32x4_t vacc3xCDEF = vacc0xCDEF;
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      const int8_t* restrict a2 = a[2];
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+      }
+      const int8_t* restrict a3 = a[3];
+      if XNN_UNPREDICTABLE(a3 != zero) {
+        a3 = (const int8_t*) ((uintptr_t) a3 + a_offset);
+      }
+      a += 4;
+
+      size_t k = kc;
+
+
+      while (k >= 8 * sizeof(int8_t)) {
+        const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+        const int8x8_t va2 = vld1_s8(a2); a2 += 8;
+        const int8x8_t va3 = vld1_s8(a3); a3 += 8;
+
+        const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+        const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+        const int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
+        const int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
+        const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+        const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+        const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+        const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+        const int16x8_t vprod1x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        const int16x8_t vprod1x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+        const int16x8_t vprod1x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+        vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc0);
+        vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc1);
+        vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
+        vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc3);
+        const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+        const int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+        vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0);
+        vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1);
+        vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
+        vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3);
+        const int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+        const int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+        const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+        const int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
+        const int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+        const int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+        const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+        const int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
+        const int16x8_t vprod2x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+        const int16x8_t vprod2x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+        const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+        const int16x8_t vprod2x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+        vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
+        vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1);
+        vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
+        vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3);
+        const int16x8_t vprod2xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+        const int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+        const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+        const int16x8_t vprod2xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+        vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0);
+        vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1);
+        vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2);
+        vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3);
+        const int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+        const int16x8_t vprod3x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+        const int16x8_t vprod3x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+        const int16x8_t vprod3x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
+        vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c0);
+        vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c1);
+        vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c2);
+        vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c3);
+        const int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+        const int16x8_t vprod3x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+        const int16x8_t vprod3x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+        const int16x8_t vprod3x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
+        vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c0);
+        vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c1);
+        vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c2);
+        vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c3);
+        const int16x8_t vprod3x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+        const int16x8_t vprod3x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+        const int16x8_t vprod3x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+        const int16x8_t vprod3x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
+        vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc0);
+        vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc1);
+        vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc2);
+        vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc3);
+        const int16x8_t vprod3xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+        const int16x8_t vprod3xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+        const int16x8_t vprod3xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+        const int16x8_t vprod3xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
+        vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc0);
+        vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc1);
+        vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc2);
+        vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc3);
+
+        k -= 8 * sizeof(int8_t);
+      }
+
+      if XNN_UNLIKELY(k != 0) {
+        const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+        const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
+        const int8x8_t va2 = vld1_s8(a2); a2 = (const int8_t*) ((uintptr_t) a2 + k);
+        const int8x8_t va3 = vld1_s8(a3); a3 = (const int8_t*) ((uintptr_t) a3 + k);
+
+        const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+        const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+        const int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+        const int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+        const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+        const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+        const int16x8_t vprod1x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc0);
+        const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0);
+        const int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+        const int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+        const int16x8_t vprod2x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+        vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
+        const int16x8_t vprod2xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+        vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0);
+        const int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+        vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c0);
+        const int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+        vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c0);
+        const int16x8_t vprod3x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+        vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc0);
+        const int16x8_t vprod3xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+        vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc0);
+
+        if (k > 2 * sizeof(int8_t)) {
+          const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb89ABc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vbCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+          const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+          const int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+          const int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+          const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+          vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+          const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+          vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+          const int16x8_t vprod1x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+          vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc1);
+          const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+          vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1);
+          const int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+          vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+          const int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+          vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+          const int16x8_t vprod2x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+          vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1);
+          const int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+          vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1);
+          const int16x8_t vprod3x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+          vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c1);
+          const int16x8_t vprod3x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+          vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c1);
+          const int16x8_t vprod3x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+          vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc1);
+          const int16x8_t vprod3xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+          vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc1);
+
+          if (k > 4 * sizeof(int8_t)) {
+            const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int8x8_t vb89ABc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int8x8_t vbCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+            const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+            const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+            const int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+            const int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+            const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+            vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+            const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+            vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+            const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+            vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
+            const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+            vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
+            const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+            vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+            const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+            vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+            const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+            vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
+            const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+            vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2);
+            const int16x8_t vprod3x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+            vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c2);
+            const int16x8_t vprod3x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+            vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c2);
+            const int16x8_t vprod3x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+            vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc2);
+            const int16x8_t vprod3xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+            vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc2);
+          }
+        }
+      }
+      p -= 4 * sizeof(void*);
+    } while (p != 0);
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+    vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc1x89AB = vqrdmulhq_s32(vacc1x89AB, vmultiplier);
+    vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier);
+    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+    vacc2x89AB = vqrdmulhq_s32(vacc2x89AB, vmultiplier);
+    vacc2xCDEF = vqrdmulhq_s32(vacc2xCDEF, vmultiplier);
+    vacc3x0123 = vqrdmulhq_s32(vacc3x0123, vmultiplier);
+    vacc3x4567 = vqrdmulhq_s32(vacc3x4567, vmultiplier);
+    vacc3x89AB = vqrdmulhq_s32(vacc3x89AB, vmultiplier);
+    vacc3xCDEF = vqrdmulhq_s32(vacc3xCDEF, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+    vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc1x89AB = vsraq_n_s32(vacc1x89AB, vbicq_s32(vacc1x89AB, vzero_shift_mask), 31);
+    vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31);
+    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+    vacc2x89AB = vsraq_n_s32(vacc2x89AB, vbicq_s32(vacc2x89AB, vzero_shift_mask), 31);
+    vacc2xCDEF = vsraq_n_s32(vacc2xCDEF, vbicq_s32(vacc2xCDEF, vzero_shift_mask), 31);
+    vacc3x0123 = vsraq_n_s32(vacc3x0123, vbicq_s32(vacc3x0123, vzero_shift_mask), 31);
+    vacc3x4567 = vsraq_n_s32(vacc3x4567, vbicq_s32(vacc3x4567, vzero_shift_mask), 31);
+    vacc3x89AB = vsraq_n_s32(vacc3x89AB, vbicq_s32(vacc3x89AB, vzero_shift_mask), 31);
+    vacc3xCDEF = vsraq_n_s32(vacc3xCDEF, vbicq_s32(vacc3xCDEF, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc1x89AB = vrshlq_s32(vacc1x89AB, vright_shift);
+    vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_shift);
+    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+    vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_shift);
+    vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_shift);
+    vacc3x0123 = vrshlq_s32(vacc3x0123, vright_shift);
+    vacc3x4567 = vrshlq_s32(vacc3x4567, vright_shift);
+    vacc3x89AB = vrshlq_s32(vacc3x89AB, vright_shift);
+    vacc3xCDEF = vrshlq_s32(vacc3xCDEF, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+    const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), voutput_zero_point);
+    const int16x8_t vacc3x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567), voutput_zero_point);
+    const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+    int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
+    int8x16_t vout2x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc2x89ABCDEF);
+    int8x16_t vout3x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc3x01234567), vacc3x89ABCDEF);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+    const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)), voutput_zero_point);
+    const int16x8_t vacc3x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x0123), vqmovn_s32(vacc3x4567)), voutput_zero_point);
+    const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3xCDEF)), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+    int8x16_t vout1x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc1x01234567), vqmovn_s16(vacc1x89ABCDEF));
+    int8x16_t vout2x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc2x89ABCDEF));
+    int8x16_t vout3x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc3x01234567), vqmovn_s16(vacc3x89ABCDEF));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout3x0123456789ABCDEF = vmaxq_s8(vout3x0123456789ABCDEF, voutput_min);
+    vout2x0123456789ABCDEF = vmaxq_s8(vout2x0123456789ABCDEF, voutput_min);
+    vout1x0123456789ABCDEF = vmaxq_s8(vout1x0123456789ABCDEF, voutput_min);
+    vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+
+    vout3x0123456789ABCDEF = vminq_s8(vout3x0123456789ABCDEF, voutput_max);
+    vout2x0123456789ABCDEF = vminq_s8(vout2x0123456789ABCDEF, voutput_max);
+    vout1x0123456789ABCDEF = vminq_s8(vout1x0123456789ABCDEF, voutput_max);
+    vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+
+    if (nc >= 16) {
+      vst1q_s8(c3 + 0, vout3x0123456789ABCDEF);
+      vst1q_s8(c2 + 0, vout2x0123456789ABCDEF);
+      vst1q_s8(c1 + 0, vout1x0123456789ABCDEF);
+      vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+
+      c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 16;
+    } else {
+      int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8(vout1x0123456789ABCDEF));
+      int8x16_t vout2x01234567_3x01234567 = vcombine_s8(vget_low_s8(vout2x0123456789ABCDEF), vget_low_s8(vout3x0123456789ABCDEF));
+      if (nc & 8) {
+        vst1_s8(c3, vget_high_s8(vout2x01234567_3x01234567)); c3 += 8;
+        vst1_s8(c2, vget_low_s8(vout2x01234567_3x01234567)); c2 += 8;
+        vst1_s8(c1, vget_high_s8(vout0x01234567_1x01234567)); c1 += 8;
+        vst1_s8(c0, vget_low_s8(vout0x01234567_1x01234567)); c0 += 8;
+        vout2x01234567_3x01234567 = vcombine_s8(vget_high_s8(vout2x0123456789ABCDEF), vget_high_s8(vout3x0123456789ABCDEF));
+        vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0123456789ABCDEF));
+      }
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c3, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 2); c3 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 0); c2 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c3, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 4); c3 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 0); c2 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c3, vout2x01234567_3x01234567, 8);
+        vst1q_lane_s8(c2, vout2x01234567_3x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/4x16c4-minmax-neondot.c b/src/qs8-igemm/gen/4x16c4-minmax-neondot.c
index 253ba24..c4d0481 100644
--- a/src/qs8-igemm/gen/4x16c4-minmax-neondot.c
+++ b/src/qs8-igemm/gen/4x16c4-minmax-neondot.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot(
@@ -40,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 4);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
@@ -103,14 +104,14 @@
         const int8x8_t va3x01234567 = vld1_s8(a3); a3 += 8;
 
         // Load a 8x16 block of weights.
-        const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb0123x89AB = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb0123xCDEF = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb4567x89AB = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb4567xCDEF = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
+        const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+        const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+        const int8x16_t vb0123x89AB = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+        const int8x16_t vb0123xCDEF = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+        const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+        const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+        const int8x16_t vb4567x89AB = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+        const int8x16_t vb4567xCDEF = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
 
         // Multiply-accumulate: 4x8 * 8x16 --> 4x16.
         vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb0123x0123, va0x01234567, 0);
@@ -148,7 +149,7 @@
 
         k -= 8 * sizeof(int8_t);
       }
-      // Handle up to 7 final positions of `k`
+      // Handle up to 4 final positions of `k`
       if XNN_UNLIKELY(k != 0) {
         // Load a 4x4 block of activations.
         const int8x8_t va0x01234567 = vld1_s8(a0);
@@ -157,10 +158,10 @@
         const int8x8_t va3x01234567 = vld1_s8(a3);
 
         // Load a 4x16 block of weights.
-        const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb0123x89AB = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb0123xCDEF = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
+        const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+        const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+        const int8x16_t vb0123x89AB = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+        const int8x16_t vb0123xCDEF = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
 
         // Multiply-accumulate: 4x4 * 4x16 --> 4x16.
         vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb0123x0123, va0x01234567, 0);
@@ -179,32 +180,6 @@
         vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb0123x4567, va3x01234567, 0);
         vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb0123x89AB, va3x01234567, 0);
         vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb0123xCDEF, va3x01234567, 0);
-
-        if (k > 4) {
-          // Load a 4x16 block of weights.
-          const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-          const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-          const int8x16_t vb4567x89AB = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-          const int8x16_t vb4567xCDEF = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-
-          // Multiply-accumulate: 4x4 * 4x16 --> 4x16.
-          vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
-          vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
-          vacc0x89AB = vdotq_lane_s32(vacc0x89AB, vb4567x89AB, va0x01234567, 1);
-          vacc0xCDEF = vdotq_lane_s32(vacc0xCDEF, vb4567xCDEF, va0x01234567, 1);
-          vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb4567x0123, va1x01234567, 1);
-          vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb4567x4567, va1x01234567, 1);
-          vacc1x89AB = vdotq_lane_s32(vacc1x89AB, vb4567x89AB, va1x01234567, 1);
-          vacc1xCDEF = vdotq_lane_s32(vacc1xCDEF, vb4567xCDEF, va1x01234567, 1);
-          vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb4567x0123, va2x01234567, 1);
-          vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb4567x4567, va2x01234567, 1);
-          vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb4567x89AB, va2x01234567, 1);
-          vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb4567xCDEF, va2x01234567, 1);
-          vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb4567x0123, va3x01234567, 1);
-          vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb4567x4567, va3x01234567, 1);
-          vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb4567x89AB, va3x01234567, 1);
-          vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb4567xCDEF, va3x01234567, 1);
-        }
       }
       p -= 4 * sizeof(void*);
     } while (p != 0);
diff --git a/src/qs8-igemm/gen/4x16c8-minmax-avx512skx.c b/src/qs8-igemm/gen/4x16c8-minmax-avx512skx.c
index b16ed2c..4f724c4 100644
--- a/src/qs8-igemm/gen/4x16c8-minmax-avx512skx.c
+++ b/src/qs8-igemm/gen/4x16c8-minmax-avx512skx.c
@@ -13,6 +13,7 @@
 
 #include <xnnpack/igemm.h>
 #include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx(
@@ -38,6 +39,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
diff --git a/src/qs8-igemm/gen/4x16c8-minmax-neon-mlal-padal.c b/src/qs8-igemm/gen/4x16c8-minmax-neon-mlal-padal.c
new file mode 100644
index 0000000..29b7141
--- /dev/null
+++ b/src/qs8-igemm/gen/4x16c8-minmax-neon-mlal-padal.c
@@ -0,0 +1,854 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/c8-neon-mull-padal.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (4 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+  int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    c3 = c2;
+  }
+
+  do {
+    int32x4_t vacc0x0 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x1 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x2 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x3 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x4 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x5 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x6 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x7 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x8 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x9 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x10 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x11 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x12 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x13 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x14 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x15 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc1x0 = vacc0x0;
+    int32x4_t vacc1x1 = vacc0x1;
+    int32x4_t vacc1x2 = vacc0x2;
+    int32x4_t vacc1x3 = vacc0x3;
+    int32x4_t vacc1x4 = vacc0x4;
+    int32x4_t vacc1x5 = vacc0x5;
+    int32x4_t vacc1x6 = vacc0x6;
+    int32x4_t vacc1x7 = vacc0x7;
+    int32x4_t vacc1x8 = vacc0x8;
+    int32x4_t vacc1x9 = vacc0x9;
+    int32x4_t vacc1x10 = vacc0x10;
+    int32x4_t vacc1x11 = vacc0x11;
+    int32x4_t vacc1x12 = vacc0x12;
+    int32x4_t vacc1x13 = vacc0x13;
+    int32x4_t vacc1x14 = vacc0x14;
+    int32x4_t vacc1x15 = vacc0x15;
+    int32x4_t vacc2x0 = vacc0x0;
+    int32x4_t vacc2x1 = vacc0x1;
+    int32x4_t vacc2x2 = vacc0x2;
+    int32x4_t vacc2x3 = vacc0x3;
+    int32x4_t vacc2x4 = vacc0x4;
+    int32x4_t vacc2x5 = vacc0x5;
+    int32x4_t vacc2x6 = vacc0x6;
+    int32x4_t vacc2x7 = vacc0x7;
+    int32x4_t vacc2x8 = vacc0x8;
+    int32x4_t vacc2x9 = vacc0x9;
+    int32x4_t vacc2x10 = vacc0x10;
+    int32x4_t vacc2x11 = vacc0x11;
+    int32x4_t vacc2x12 = vacc0x12;
+    int32x4_t vacc2x13 = vacc0x13;
+    int32x4_t vacc2x14 = vacc0x14;
+    int32x4_t vacc2x15 = vacc0x15;
+    int32x4_t vacc3x0 = vacc0x0;
+    int32x4_t vacc3x1 = vacc0x1;
+    int32x4_t vacc3x2 = vacc0x2;
+    int32x4_t vacc3x3 = vacc0x3;
+    int32x4_t vacc3x4 = vacc0x4;
+    int32x4_t vacc3x5 = vacc0x5;
+    int32x4_t vacc3x6 = vacc0x6;
+    int32x4_t vacc3x7 = vacc0x7;
+    int32x4_t vacc3x8 = vacc0x8;
+    int32x4_t vacc3x9 = vacc0x9;
+    int32x4_t vacc3x10 = vacc0x10;
+    int32x4_t vacc3x11 = vacc0x11;
+    int32x4_t vacc3x12 = vacc0x12;
+    int32x4_t vacc3x13 = vacc0x13;
+    int32x4_t vacc3x14 = vacc0x14;
+    int32x4_t vacc3x15 = vacc0x15;
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      const int8_t* restrict a2 = a[2];
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+      }
+      const int8_t* restrict a3 = a[3];
+      if XNN_UNPREDICTABLE(a3 != zero) {
+        a3 = (const int8_t*) ((uintptr_t) a3 + a_offset);
+      }
+      a += 4;
+
+      size_t k = kc;
+      // 2x partial unrolled loop to load 16 bytes at a time using MLA.
+      while (k >= 16 * sizeof(int8_t)) {
+        const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va0x1 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va1x0 = vld1_s8(a1); a1 += 8;
+        const int8x8_t va1x1 = vld1_s8(a1); a1 += 8;
+        const int8x8_t va2x0 = vld1_s8(a2); a2 += 8;
+        const int8x8_t va2x1 = vld1_s8(a2); a2 += 8;
+        const int8x8_t va3x0 = vld1_s8(a3); a3 += 8;
+        const int8x8_t va3x1 = vld1_s8(a3); a3 += 8;
+
+        const int8x8_t vb0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb4x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb5x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb6x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb7x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb8x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb9x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb10x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb11x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb12x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb13x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb14x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb15x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+
+        const int8x8_t vb0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x0 = vmull_s8(vb0x0, va0x0);
+        int16x8_t vprod1x0 = vmull_s8(vb0x0, va1x0);
+        int16x8_t vprod2x0 = vmull_s8(vb0x0, va2x0);
+        int16x8_t vprod3x0 = vmull_s8(vb0x0, va3x0);
+        vprod0x0 = vmlal_s8(vprod0x0, vb0x1, va0x1);
+        vprod1x0 = vmlal_s8(vprod1x0, vb0x1, va1x1);
+        vprod2x0 = vmlal_s8(vprod2x0, vb0x1, va2x1);
+        vprod3x0 = vmlal_s8(vprod3x0, vb0x1, va3x1);
+        vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+        vacc1x0 = vpadalq_s16(vacc1x0, vprod1x0);
+        vacc2x0 = vpadalq_s16(vacc2x0, vprod2x0);
+        vacc3x0 = vpadalq_s16(vacc3x0, vprod3x0);
+        const int8x8_t vb1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x1 = vmull_s8(vb1x0, va0x0);
+        int16x8_t vprod1x1 = vmull_s8(vb1x0, va1x0);
+        int16x8_t vprod2x1 = vmull_s8(vb1x0, va2x0);
+        int16x8_t vprod3x1 = vmull_s8(vb1x0, va3x0);
+        vprod0x1 = vmlal_s8(vprod0x1, vb1x1, va0x1);
+        vprod1x1 = vmlal_s8(vprod1x1, vb1x1, va1x1);
+        vprod2x1 = vmlal_s8(vprod2x1, vb1x1, va2x1);
+        vprod3x1 = vmlal_s8(vprod3x1, vb1x1, va3x1);
+        vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+        vacc1x1 = vpadalq_s16(vacc1x1, vprod1x1);
+        vacc2x1 = vpadalq_s16(vacc2x1, vprod2x1);
+        vacc3x1 = vpadalq_s16(vacc3x1, vprod3x1);
+        const int8x8_t vb2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x2 = vmull_s8(vb2x0, va0x0);
+        int16x8_t vprod1x2 = vmull_s8(vb2x0, va1x0);
+        int16x8_t vprod2x2 = vmull_s8(vb2x0, va2x0);
+        int16x8_t vprod3x2 = vmull_s8(vb2x0, va3x0);
+        vprod0x2 = vmlal_s8(vprod0x2, vb2x1, va0x1);
+        vprod1x2 = vmlal_s8(vprod1x2, vb2x1, va1x1);
+        vprod2x2 = vmlal_s8(vprod2x2, vb2x1, va2x1);
+        vprod3x2 = vmlal_s8(vprod3x2, vb2x1, va3x1);
+        vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+        vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);
+        vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
+        vacc3x2 = vpadalq_s16(vacc3x2, vprod3x2);
+        const int8x8_t vb3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x3 = vmull_s8(vb3x0, va0x0);
+        int16x8_t vprod1x3 = vmull_s8(vb3x0, va1x0);
+        int16x8_t vprod2x3 = vmull_s8(vb3x0, va2x0);
+        int16x8_t vprod3x3 = vmull_s8(vb3x0, va3x0);
+        vprod0x3 = vmlal_s8(vprod0x3, vb3x1, va0x1);
+        vprod1x3 = vmlal_s8(vprod1x3, vb3x1, va1x1);
+        vprod2x3 = vmlal_s8(vprod2x3, vb3x1, va2x1);
+        vprod3x3 = vmlal_s8(vprod3x3, vb3x1, va3x1);
+        vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+        vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);
+        vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
+        vacc3x3 = vpadalq_s16(vacc3x3, vprod3x3);
+        const int8x8_t vb4x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x4 = vmull_s8(vb4x0, va0x0);
+        int16x8_t vprod1x4 = vmull_s8(vb4x0, va1x0);
+        int16x8_t vprod2x4 = vmull_s8(vb4x0, va2x0);
+        int16x8_t vprod3x4 = vmull_s8(vb4x0, va3x0);
+        vprod0x4 = vmlal_s8(vprod0x4, vb4x1, va0x1);
+        vprod1x4 = vmlal_s8(vprod1x4, vb4x1, va1x1);
+        vprod2x4 = vmlal_s8(vprod2x4, vb4x1, va2x1);
+        vprod3x4 = vmlal_s8(vprod3x4, vb4x1, va3x1);
+        vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+        vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);
+        vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);
+        vacc3x4 = vpadalq_s16(vacc3x4, vprod3x4);
+        const int8x8_t vb5x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x5 = vmull_s8(vb5x0, va0x0);
+        int16x8_t vprod1x5 = vmull_s8(vb5x0, va1x0);
+        int16x8_t vprod2x5 = vmull_s8(vb5x0, va2x0);
+        int16x8_t vprod3x5 = vmull_s8(vb5x0, va3x0);
+        vprod0x5 = vmlal_s8(vprod0x5, vb5x1, va0x1);
+        vprod1x5 = vmlal_s8(vprod1x5, vb5x1, va1x1);
+        vprod2x5 = vmlal_s8(vprod2x5, vb5x1, va2x1);
+        vprod3x5 = vmlal_s8(vprod3x5, vb5x1, va3x1);
+        vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+        vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);
+        vacc2x5 = vpadalq_s16(vacc2x5, vprod2x5);
+        vacc3x5 = vpadalq_s16(vacc3x5, vprod3x5);
+        const int8x8_t vb6x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x6 = vmull_s8(vb6x0, va0x0);
+        int16x8_t vprod1x6 = vmull_s8(vb6x0, va1x0);
+        int16x8_t vprod2x6 = vmull_s8(vb6x0, va2x0);
+        int16x8_t vprod3x6 = vmull_s8(vb6x0, va3x0);
+        vprod0x6 = vmlal_s8(vprod0x6, vb6x1, va0x1);
+        vprod1x6 = vmlal_s8(vprod1x6, vb6x1, va1x1);
+        vprod2x6 = vmlal_s8(vprod2x6, vb6x1, va2x1);
+        vprod3x6 = vmlal_s8(vprod3x6, vb6x1, va3x1);
+        vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+        vacc1x6 = vpadalq_s16(vacc1x6, vprod1x6);
+        vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6);
+        vacc3x6 = vpadalq_s16(vacc3x6, vprod3x6);
+        const int8x8_t vb7x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x7 = vmull_s8(vb7x0, va0x0);
+        int16x8_t vprod1x7 = vmull_s8(vb7x0, va1x0);
+        int16x8_t vprod2x7 = vmull_s8(vb7x0, va2x0);
+        int16x8_t vprod3x7 = vmull_s8(vb7x0, va3x0);
+        vprod0x7 = vmlal_s8(vprod0x7, vb7x1, va0x1);
+        vprod1x7 = vmlal_s8(vprod1x7, vb7x1, va1x1);
+        vprod2x7 = vmlal_s8(vprod2x7, vb7x1, va2x1);
+        vprod3x7 = vmlal_s8(vprod3x7, vb7x1, va3x1);
+        vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+        vacc1x7 = vpadalq_s16(vacc1x7, vprod1x7);
+        vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7);
+        vacc3x7 = vpadalq_s16(vacc3x7, vprod3x7);
+        const int8x8_t vb8x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x8 = vmull_s8(vb8x0, va0x0);
+        int16x8_t vprod1x8 = vmull_s8(vb8x0, va1x0);
+        int16x8_t vprod2x8 = vmull_s8(vb8x0, va2x0);
+        int16x8_t vprod3x8 = vmull_s8(vb8x0, va3x0);
+        vprod0x8 = vmlal_s8(vprod0x8, vb8x1, va0x1);
+        vprod1x8 = vmlal_s8(vprod1x8, vb8x1, va1x1);
+        vprod2x8 = vmlal_s8(vprod2x8, vb8x1, va2x1);
+        vprod3x8 = vmlal_s8(vprod3x8, vb8x1, va3x1);
+        vacc0x8 = vpadalq_s16(vacc0x8, vprod0x8);
+        vacc1x8 = vpadalq_s16(vacc1x8, vprod1x8);
+        vacc2x8 = vpadalq_s16(vacc2x8, vprod2x8);
+        vacc3x8 = vpadalq_s16(vacc3x8, vprod3x8);
+        const int8x8_t vb9x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x9 = vmull_s8(vb9x0, va0x0);
+        int16x8_t vprod1x9 = vmull_s8(vb9x0, va1x0);
+        int16x8_t vprod2x9 = vmull_s8(vb9x0, va2x0);
+        int16x8_t vprod3x9 = vmull_s8(vb9x0, va3x0);
+        vprod0x9 = vmlal_s8(vprod0x9, vb9x1, va0x1);
+        vprod1x9 = vmlal_s8(vprod1x9, vb9x1, va1x1);
+        vprod2x9 = vmlal_s8(vprod2x9, vb9x1, va2x1);
+        vprod3x9 = vmlal_s8(vprod3x9, vb9x1, va3x1);
+        vacc0x9 = vpadalq_s16(vacc0x9, vprod0x9);
+        vacc1x9 = vpadalq_s16(vacc1x9, vprod1x9);
+        vacc2x9 = vpadalq_s16(vacc2x9, vprod2x9);
+        vacc3x9 = vpadalq_s16(vacc3x9, vprod3x9);
+        const int8x8_t vb10x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x10 = vmull_s8(vb10x0, va0x0);
+        int16x8_t vprod1x10 = vmull_s8(vb10x0, va1x0);
+        int16x8_t vprod2x10 = vmull_s8(vb10x0, va2x0);
+        int16x8_t vprod3x10 = vmull_s8(vb10x0, va3x0);
+        vprod0x10 = vmlal_s8(vprod0x10, vb10x1, va0x1);
+        vprod1x10 = vmlal_s8(vprod1x10, vb10x1, va1x1);
+        vprod2x10 = vmlal_s8(vprod2x10, vb10x1, va2x1);
+        vprod3x10 = vmlal_s8(vprod3x10, vb10x1, va3x1);
+        vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
+        vacc1x10 = vpadalq_s16(vacc1x10, vprod1x10);
+        vacc2x10 = vpadalq_s16(vacc2x10, vprod2x10);
+        vacc3x10 = vpadalq_s16(vacc3x10, vprod3x10);
+        const int8x8_t vb11x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x11 = vmull_s8(vb11x0, va0x0);
+        int16x8_t vprod1x11 = vmull_s8(vb11x0, va1x0);
+        int16x8_t vprod2x11 = vmull_s8(vb11x0, va2x0);
+        int16x8_t vprod3x11 = vmull_s8(vb11x0, va3x0);
+        vprod0x11 = vmlal_s8(vprod0x11, vb11x1, va0x1);
+        vprod1x11 = vmlal_s8(vprod1x11, vb11x1, va1x1);
+        vprod2x11 = vmlal_s8(vprod2x11, vb11x1, va2x1);
+        vprod3x11 = vmlal_s8(vprod3x11, vb11x1, va3x1);
+        vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11);
+        vacc1x11 = vpadalq_s16(vacc1x11, vprod1x11);
+        vacc2x11 = vpadalq_s16(vacc2x11, vprod2x11);
+        vacc3x11 = vpadalq_s16(vacc3x11, vprod3x11);
+        const int8x8_t vb12x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x12 = vmull_s8(vb12x0, va0x0);
+        int16x8_t vprod1x12 = vmull_s8(vb12x0, va1x0);
+        int16x8_t vprod2x12 = vmull_s8(vb12x0, va2x0);
+        int16x8_t vprod3x12 = vmull_s8(vb12x0, va3x0);
+        vprod0x12 = vmlal_s8(vprod0x12, vb12x1, va0x1);
+        vprod1x12 = vmlal_s8(vprod1x12, vb12x1, va1x1);
+        vprod2x12 = vmlal_s8(vprod2x12, vb12x1, va2x1);
+        vprod3x12 = vmlal_s8(vprod3x12, vb12x1, va3x1);
+        vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);
+        vacc1x12 = vpadalq_s16(vacc1x12, vprod1x12);
+        vacc2x12 = vpadalq_s16(vacc2x12, vprod2x12);
+        vacc3x12 = vpadalq_s16(vacc3x12, vprod3x12);
+        const int8x8_t vb13x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x13 = vmull_s8(vb13x0, va0x0);
+        int16x8_t vprod1x13 = vmull_s8(vb13x0, va1x0);
+        int16x8_t vprod2x13 = vmull_s8(vb13x0, va2x0);
+        int16x8_t vprod3x13 = vmull_s8(vb13x0, va3x0);
+        vprod0x13 = vmlal_s8(vprod0x13, vb13x1, va0x1);
+        vprod1x13 = vmlal_s8(vprod1x13, vb13x1, va1x1);
+        vprod2x13 = vmlal_s8(vprod2x13, vb13x1, va2x1);
+        vprod3x13 = vmlal_s8(vprod3x13, vb13x1, va3x1);
+        vacc0x13 = vpadalq_s16(vacc0x13, vprod0x13);
+        vacc1x13 = vpadalq_s16(vacc1x13, vprod1x13);
+        vacc2x13 = vpadalq_s16(vacc2x13, vprod2x13);
+        vacc3x13 = vpadalq_s16(vacc3x13, vprod3x13);
+        const int8x8_t vb14x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x14 = vmull_s8(vb14x0, va0x0);
+        int16x8_t vprod1x14 = vmull_s8(vb14x0, va1x0);
+        int16x8_t vprod2x14 = vmull_s8(vb14x0, va2x0);
+        int16x8_t vprod3x14 = vmull_s8(vb14x0, va3x0);
+        vprod0x14 = vmlal_s8(vprod0x14, vb14x1, va0x1);
+        vprod1x14 = vmlal_s8(vprod1x14, vb14x1, va1x1);
+        vprod2x14 = vmlal_s8(vprod2x14, vb14x1, va2x1);
+        vprod3x14 = vmlal_s8(vprod3x14, vb14x1, va3x1);
+        vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
+        vacc1x14 = vpadalq_s16(vacc1x14, vprod1x14);
+        vacc2x14 = vpadalq_s16(vacc2x14, vprod2x14);
+        vacc3x14 = vpadalq_s16(vacc3x14, vprod3x14);
+        const int8x8_t vb15x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x15 = vmull_s8(vb15x0, va0x0);
+        int16x8_t vprod1x15 = vmull_s8(vb15x0, va1x0);
+        int16x8_t vprod2x15 = vmull_s8(vb15x0, va2x0);
+        int16x8_t vprod3x15 = vmull_s8(vb15x0, va3x0);
+        vprod0x15 = vmlal_s8(vprod0x15, vb15x1, va0x1);
+        vprod1x15 = vmlal_s8(vprod1x15, vb15x1, va1x1);
+        vprod2x15 = vmlal_s8(vprod2x15, vb15x1, va2x1);
+        vprod3x15 = vmlal_s8(vprod3x15, vb15x1, va3x1);
+        vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
+        vacc1x15 = vpadalq_s16(vacc1x15, vprod1x15);
+        vacc2x15 = vpadalq_s16(vacc2x15, vprod2x15);
+        vacc3x15 = vpadalq_s16(vacc3x15, vprod3x15);
+
+        k -= 16 * sizeof(int8_t);
+      }
+
+      // Handle 8 bytes at a time using MUL.
+      if (k > 0) {
+        const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+        const int8x8_t va2 = vld1_s8(a2); a2 += 8;
+        const int8x8_t va3 = vld1_s8(a3); a3 += 8;
+
+        const int8x8_t vb0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x0 = vmull_s8(vb0, va0);
+        const int16x8_t vprod1x0 = vmull_s8(vb0, va1);
+        const int16x8_t vprod2x0 = vmull_s8(vb0, va2);
+        const int16x8_t vprod3x0 = vmull_s8(vb0, va3);
+        vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+        vacc1x0 = vpadalq_s16(vacc1x0, vprod1x0);
+        vacc2x0 = vpadalq_s16(vacc2x0, vprod2x0);
+        vacc3x0 = vpadalq_s16(vacc3x0, vprod3x0);
+        const int8x8_t vb1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x1 = vmull_s8(vb1, va0);
+        const int16x8_t vprod1x1 = vmull_s8(vb1, va1);
+        const int16x8_t vprod2x1 = vmull_s8(vb1, va2);
+        const int16x8_t vprod3x1 = vmull_s8(vb1, va3);
+        vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+        vacc1x1 = vpadalq_s16(vacc1x1, vprod1x1);
+        vacc2x1 = vpadalq_s16(vacc2x1, vprod2x1);
+        vacc3x1 = vpadalq_s16(vacc3x1, vprod3x1);
+        const int8x8_t vb2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x2 = vmull_s8(vb2, va0);
+        const int16x8_t vprod1x2 = vmull_s8(vb2, va1);
+        const int16x8_t vprod2x2 = vmull_s8(vb2, va2);
+        const int16x8_t vprod3x2 = vmull_s8(vb2, va3);
+        vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+        vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);
+        vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
+        vacc3x2 = vpadalq_s16(vacc3x2, vprod3x2);
+        const int8x8_t vb3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x3 = vmull_s8(vb3, va0);
+        const int16x8_t vprod1x3 = vmull_s8(vb3, va1);
+        const int16x8_t vprod2x3 = vmull_s8(vb3, va2);
+        const int16x8_t vprod3x3 = vmull_s8(vb3, va3);
+        vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+        vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);
+        vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
+        vacc3x3 = vpadalq_s16(vacc3x3, vprod3x3);
+        const int8x8_t vb4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x4 = vmull_s8(vb4, va0);
+        const int16x8_t vprod1x4 = vmull_s8(vb4, va1);
+        const int16x8_t vprod2x4 = vmull_s8(vb4, va2);
+        const int16x8_t vprod3x4 = vmull_s8(vb4, va3);
+        vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+        vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);
+        vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);
+        vacc3x4 = vpadalq_s16(vacc3x4, vprod3x4);
+        const int8x8_t vb5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x5 = vmull_s8(vb5, va0);
+        const int16x8_t vprod1x5 = vmull_s8(vb5, va1);
+        const int16x8_t vprod2x5 = vmull_s8(vb5, va2);
+        const int16x8_t vprod3x5 = vmull_s8(vb5, va3);
+        vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+        vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);
+        vacc2x5 = vpadalq_s16(vacc2x5, vprod2x5);
+        vacc3x5 = vpadalq_s16(vacc3x5, vprod3x5);
+        const int8x8_t vb6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x6 = vmull_s8(vb6, va0);
+        const int16x8_t vprod1x6 = vmull_s8(vb6, va1);
+        const int16x8_t vprod2x6 = vmull_s8(vb6, va2);
+        const int16x8_t vprod3x6 = vmull_s8(vb6, va3);
+        vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+        vacc1x6 = vpadalq_s16(vacc1x6, vprod1x6);
+        vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6);
+        vacc3x6 = vpadalq_s16(vacc3x6, vprod3x6);
+        const int8x8_t vb7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x7 = vmull_s8(vb7, va0);
+        const int16x8_t vprod1x7 = vmull_s8(vb7, va1);
+        const int16x8_t vprod2x7 = vmull_s8(vb7, va2);
+        const int16x8_t vprod3x7 = vmull_s8(vb7, va3);
+        vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+        vacc1x7 = vpadalq_s16(vacc1x7, vprod1x7);
+        vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7);
+        vacc3x7 = vpadalq_s16(vacc3x7, vprod3x7);
+        const int8x8_t vb8 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x8 = vmull_s8(vb8, va0);
+        const int16x8_t vprod1x8 = vmull_s8(vb8, va1);
+        const int16x8_t vprod2x8 = vmull_s8(vb8, va2);
+        const int16x8_t vprod3x8 = vmull_s8(vb8, va3);
+        vacc0x8 = vpadalq_s16(vacc0x8, vprod0x8);
+        vacc1x8 = vpadalq_s16(vacc1x8, vprod1x8);
+        vacc2x8 = vpadalq_s16(vacc2x8, vprod2x8);
+        vacc3x8 = vpadalq_s16(vacc3x8, vprod3x8);
+        const int8x8_t vb9 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x9 = vmull_s8(vb9, va0);
+        const int16x8_t vprod1x9 = vmull_s8(vb9, va1);
+        const int16x8_t vprod2x9 = vmull_s8(vb9, va2);
+        const int16x8_t vprod3x9 = vmull_s8(vb9, va3);
+        vacc0x9 = vpadalq_s16(vacc0x9, vprod0x9);
+        vacc1x9 = vpadalq_s16(vacc1x9, vprod1x9);
+        vacc2x9 = vpadalq_s16(vacc2x9, vprod2x9);
+        vacc3x9 = vpadalq_s16(vacc3x9, vprod3x9);
+        const int8x8_t vb10 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x10 = vmull_s8(vb10, va0);
+        const int16x8_t vprod1x10 = vmull_s8(vb10, va1);
+        const int16x8_t vprod2x10 = vmull_s8(vb10, va2);
+        const int16x8_t vprod3x10 = vmull_s8(vb10, va3);
+        vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
+        vacc1x10 = vpadalq_s16(vacc1x10, vprod1x10);
+        vacc2x10 = vpadalq_s16(vacc2x10, vprod2x10);
+        vacc3x10 = vpadalq_s16(vacc3x10, vprod3x10);
+        const int8x8_t vb11 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x11 = vmull_s8(vb11, va0);
+        const int16x8_t vprod1x11 = vmull_s8(vb11, va1);
+        const int16x8_t vprod2x11 = vmull_s8(vb11, va2);
+        const int16x8_t vprod3x11 = vmull_s8(vb11, va3);
+        vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11);
+        vacc1x11 = vpadalq_s16(vacc1x11, vprod1x11);
+        vacc2x11 = vpadalq_s16(vacc2x11, vprod2x11);
+        vacc3x11 = vpadalq_s16(vacc3x11, vprod3x11);
+        const int8x8_t vb12 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x12 = vmull_s8(vb12, va0);
+        const int16x8_t vprod1x12 = vmull_s8(vb12, va1);
+        const int16x8_t vprod2x12 = vmull_s8(vb12, va2);
+        const int16x8_t vprod3x12 = vmull_s8(vb12, va3);
+        vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);
+        vacc1x12 = vpadalq_s16(vacc1x12, vprod1x12);
+        vacc2x12 = vpadalq_s16(vacc2x12, vprod2x12);
+        vacc3x12 = vpadalq_s16(vacc3x12, vprod3x12);
+        const int8x8_t vb13 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x13 = vmull_s8(vb13, va0);
+        const int16x8_t vprod1x13 = vmull_s8(vb13, va1);
+        const int16x8_t vprod2x13 = vmull_s8(vb13, va2);
+        const int16x8_t vprod3x13 = vmull_s8(vb13, va3);
+        vacc0x13 = vpadalq_s16(vacc0x13, vprod0x13);
+        vacc1x13 = vpadalq_s16(vacc1x13, vprod1x13);
+        vacc2x13 = vpadalq_s16(vacc2x13, vprod2x13);
+        vacc3x13 = vpadalq_s16(vacc3x13, vprod3x13);
+        const int8x8_t vb14 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x14 = vmull_s8(vb14, va0);
+        const int16x8_t vprod1x14 = vmull_s8(vb14, va1);
+        const int16x8_t vprod2x14 = vmull_s8(vb14, va2);
+        const int16x8_t vprod3x14 = vmull_s8(vb14, va3);
+        vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
+        vacc1x14 = vpadalq_s16(vacc1x14, vprod1x14);
+        vacc2x14 = vpadalq_s16(vacc2x14, vprod2x14);
+        vacc3x14 = vpadalq_s16(vacc3x14, vprod3x14);
+        const int8x8_t vb15 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x15 = vmull_s8(vb15, va0);
+        const int16x8_t vprod1x15 = vmull_s8(vb15, va1);
+        const int16x8_t vprod2x15 = vmull_s8(vb15, va2);
+        const int16x8_t vprod3x15 = vmull_s8(vb15, va3);
+        vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
+        vacc1x15 = vpadalq_s16(vacc1x15, vprod1x15);
+        vacc2x15 = vpadalq_s16(vacc2x15, vprod2x15);
+        vacc3x15 = vpadalq_s16(vacc3x15, vprod3x15);
+
+        k -= 8 * sizeof(int8_t);
+      }
+
+      p -= 4 * sizeof(void*);
+    } while (p != 0);
+
+#if XNN_ARCH_ARM64
+    const int32x4_t vsum0x01 = vpaddq_s32(vacc0x0, vacc0x1);
+    const int32x4_t vsum0x23 = vpaddq_s32(vacc0x2, vacc0x3);
+    const int32x4_t vsum0x45 = vpaddq_s32(vacc0x4, vacc0x5);
+    const int32x4_t vsum0x67 = vpaddq_s32(vacc0x6, vacc0x7);
+    const int32x4_t vsum0x89 = vpaddq_s32(vacc0x8, vacc0x9);
+    const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11);
+    const int32x4_t vsum0xCD = vpaddq_s32(vacc0x12, vacc0x13);
+    const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
+    const int32x4_t vsum1x01 = vpaddq_s32(vacc1x0, vacc1x1);
+    const int32x4_t vsum1x23 = vpaddq_s32(vacc1x2, vacc1x3);
+    const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);
+    const int32x4_t vsum1x67 = vpaddq_s32(vacc1x6, vacc1x7);
+    const int32x4_t vsum1x89 = vpaddq_s32(vacc1x8, vacc1x9);
+    const int32x4_t vsum1xAB = vpaddq_s32(vacc1x10, vacc1x11);
+    const int32x4_t vsum1xCD = vpaddq_s32(vacc1x12, vacc1x13);
+    const int32x4_t vsum1xEF = vpaddq_s32(vacc1x14, vacc1x15);
+    const int32x4_t vsum2x01 = vpaddq_s32(vacc2x0, vacc2x1);
+    const int32x4_t vsum2x23 = vpaddq_s32(vacc2x2, vacc2x3);
+    const int32x4_t vsum2x45 = vpaddq_s32(vacc2x4, vacc2x5);
+    const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7);
+    const int32x4_t vsum2x89 = vpaddq_s32(vacc2x8, vacc2x9);
+    const int32x4_t vsum2xAB = vpaddq_s32(vacc2x10, vacc2x11);
+    const int32x4_t vsum2xCD = vpaddq_s32(vacc2x12, vacc2x13);
+    const int32x4_t vsum2xEF = vpaddq_s32(vacc2x14, vacc2x15);
+    const int32x4_t vsum3x01 = vpaddq_s32(vacc3x0, vacc3x1);
+    const int32x4_t vsum3x23 = vpaddq_s32(vacc3x2, vacc3x3);
+    const int32x4_t vsum3x45 = vpaddq_s32(vacc3x4, vacc3x5);
+    const int32x4_t vsum3x67 = vpaddq_s32(vacc3x6, vacc3x7);
+    const int32x4_t vsum3x89 = vpaddq_s32(vacc3x8, vacc3x9);
+    const int32x4_t vsum3xAB = vpaddq_s32(vacc3x10, vacc3x11);
+    const int32x4_t vsum3xCD = vpaddq_s32(vacc3x12, vacc3x13);
+    const int32x4_t vsum3xEF = vpaddq_s32(vacc3x14, vacc3x15);
+    int32x4_t vacc0x0123 = vpaddq_s32(vsum0x01, vsum0x23);
+    int32x4_t vacc0x4567 = vpaddq_s32(vsum0x45, vsum0x67);
+    int32x4_t vacc0x89AB = vpaddq_s32(vsum0x89, vsum0xAB);
+    int32x4_t vacc0xCDEF = vpaddq_s32(vsum0xCD, vsum0xEF);
+    int32x4_t vacc1x0123 = vpaddq_s32(vsum1x01, vsum1x23);
+    int32x4_t vacc1x4567 = vpaddq_s32(vsum1x45, vsum1x67);
+    int32x4_t vacc1x89AB = vpaddq_s32(vsum1x89, vsum1xAB);
+    int32x4_t vacc1xCDEF = vpaddq_s32(vsum1xCD, vsum1xEF);
+    int32x4_t vacc2x0123 = vpaddq_s32(vsum2x01, vsum2x23);
+    int32x4_t vacc2x4567 = vpaddq_s32(vsum2x45, vsum2x67);
+    int32x4_t vacc2x89AB = vpaddq_s32(vsum2x89, vsum2xAB);
+    int32x4_t vacc2xCDEF = vpaddq_s32(vsum2xCD, vsum2xEF);
+    int32x4_t vacc3x0123 = vpaddq_s32(vsum3x01, vsum3x23);
+    int32x4_t vacc3x4567 = vpaddq_s32(vsum3x45, vsum3x67);
+    int32x4_t vacc3x89AB = vpaddq_s32(vsum3x89, vsum3xAB);
+    int32x4_t vacc3xCDEF = vpaddq_s32(vsum3xCD, vsum3xEF);
+#else
+    const int32x2_t vpsum0x0 = vadd_s32(vget_low_s32(vacc0x0), vget_high_s32(vacc0x0));
+    const int32x2_t vpsum0x1 = vadd_s32(vget_low_s32(vacc0x1), vget_high_s32(vacc0x1));
+    const int32x2_t vpsum0x2 = vadd_s32(vget_low_s32(vacc0x2), vget_high_s32(vacc0x2));
+    const int32x2_t vpsum0x3 = vadd_s32(vget_low_s32(vacc0x3), vget_high_s32(vacc0x3));
+    const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1);
+    const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3);
+    int32x4_t vacc0x0123 = vcombine_s32(vsum0x01, vsum0x23 );
+    const int32x2_t vpsum0x4 = vadd_s32(vget_low_s32(vacc0x4), vget_high_s32(vacc0x4));
+    const int32x2_t vpsum0x5 = vadd_s32(vget_low_s32(vacc0x5), vget_high_s32(vacc0x5));
+    const int32x2_t vpsum0x6 = vadd_s32(vget_low_s32(vacc0x6), vget_high_s32(vacc0x6));
+    const int32x2_t vpsum0x7 = vadd_s32(vget_low_s32(vacc0x7), vget_high_s32(vacc0x7));
+    const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5);
+    const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7);
+    int32x4_t vacc0x4567 = vcombine_s32(vsum0x45, vsum0x67 );
+    const int32x2_t vpsum0x8 = vadd_s32(vget_low_s32(vacc0x8), vget_high_s32(vacc0x8));
+    const int32x2_t vpsum0x9 = vadd_s32(vget_low_s32(vacc0x9), vget_high_s32(vacc0x9));
+    const int32x2_t vpsum0xA = vadd_s32(vget_low_s32(vacc0x10), vget_high_s32(vacc0x10));
+    const int32x2_t vpsum0xB = vadd_s32(vget_low_s32(vacc0x11), vget_high_s32(vacc0x11));
+    const int32x2_t vsum0x89 = vpadd_s32(vpsum0x8, vpsum0x9);
+    const int32x2_t vsum0xAB = vpadd_s32(vpsum0xA, vpsum0xB);
+    int32x4_t vacc0x89AB = vcombine_s32(vsum0x89, vsum0xAB );
+    const int32x2_t vpsum0xC = vadd_s32(vget_low_s32(vacc0x12), vget_high_s32(vacc0x12));
+    const int32x2_t vpsum0xD = vadd_s32(vget_low_s32(vacc0x13), vget_high_s32(vacc0x13));
+    const int32x2_t vpsum0xE = vadd_s32(vget_low_s32(vacc0x14), vget_high_s32(vacc0x14));
+    const int32x2_t vpsum0xF = vadd_s32(vget_low_s32(vacc0x15), vget_high_s32(vacc0x15));
+    const int32x2_t vsum0xCD = vpadd_s32(vpsum0xC, vpsum0xD);
+    const int32x2_t vsum0xEF = vpadd_s32(vpsum0xE, vpsum0xF);
+    int32x4_t vacc0xCDEF = vcombine_s32(vsum0xCD, vsum0xEF );
+    const int32x2_t vpsum1x0 = vadd_s32(vget_low_s32(vacc1x0), vget_high_s32(vacc1x0));
+    const int32x2_t vpsum1x1 = vadd_s32(vget_low_s32(vacc1x1), vget_high_s32(vacc1x1));
+    const int32x2_t vpsum1x2 = vadd_s32(vget_low_s32(vacc1x2), vget_high_s32(vacc1x2));
+    const int32x2_t vpsum1x3 = vadd_s32(vget_low_s32(vacc1x3), vget_high_s32(vacc1x3));
+    const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1);
+    const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3);
+    int32x4_t vacc1x0123 = vcombine_s32(vsum1x01, vsum1x23 );
+    const int32x2_t vpsum1x4 = vadd_s32(vget_low_s32(vacc1x4), vget_high_s32(vacc1x4));
+    const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));
+    const int32x2_t vpsum1x6 = vadd_s32(vget_low_s32(vacc1x6), vget_high_s32(vacc1x6));
+    const int32x2_t vpsum1x7 = vadd_s32(vget_low_s32(vacc1x7), vget_high_s32(vacc1x7));
+    const int32x2_t vsum1x45 = vpadd_s32(vpsum1x4, vpsum1x5);
+    const int32x2_t vsum1x67 = vpadd_s32(vpsum1x6, vpsum1x7);
+    int32x4_t vacc1x4567 = vcombine_s32(vsum1x45, vsum1x67 );
+    const int32x2_t vpsum1x8 = vadd_s32(vget_low_s32(vacc1x8), vget_high_s32(vacc1x8));
+    const int32x2_t vpsum1x9 = vadd_s32(vget_low_s32(vacc1x9), vget_high_s32(vacc1x9));
+    const int32x2_t vpsum1xA = vadd_s32(vget_low_s32(vacc1x10), vget_high_s32(vacc1x10));
+    const int32x2_t vpsum1xB = vadd_s32(vget_low_s32(vacc1x11), vget_high_s32(vacc1x11));
+    const int32x2_t vsum1x89 = vpadd_s32(vpsum1x8, vpsum1x9);
+    const int32x2_t vsum1xAB = vpadd_s32(vpsum1xA, vpsum1xB);
+    int32x4_t vacc1x89AB = vcombine_s32(vsum1x89, vsum1xAB );
+    const int32x2_t vpsum1xC = vadd_s32(vget_low_s32(vacc1x12), vget_high_s32(vacc1x12));
+    const int32x2_t vpsum1xD = vadd_s32(vget_low_s32(vacc1x13), vget_high_s32(vacc1x13));
+    const int32x2_t vpsum1xE = vadd_s32(vget_low_s32(vacc1x14), vget_high_s32(vacc1x14));
+    const int32x2_t vpsum1xF = vadd_s32(vget_low_s32(vacc1x15), vget_high_s32(vacc1x15));
+    const int32x2_t vsum1xCD = vpadd_s32(vpsum1xC, vpsum1xD);
+    const int32x2_t vsum1xEF = vpadd_s32(vpsum1xE, vpsum1xF);
+    int32x4_t vacc1xCDEF = vcombine_s32(vsum1xCD, vsum1xEF );
+    const int32x2_t vpsum2x0 = vadd_s32(vget_low_s32(vacc2x0), vget_high_s32(vacc2x0));
+    const int32x2_t vpsum2x1 = vadd_s32(vget_low_s32(vacc2x1), vget_high_s32(vacc2x1));
+    const int32x2_t vpsum2x2 = vadd_s32(vget_low_s32(vacc2x2), vget_high_s32(vacc2x2));
+    const int32x2_t vpsum2x3 = vadd_s32(vget_low_s32(vacc2x3), vget_high_s32(vacc2x3));
+    const int32x2_t vsum2x01 = vpadd_s32(vpsum2x0, vpsum2x1);
+    const int32x2_t vsum2x23 = vpadd_s32(vpsum2x2, vpsum2x3);
+    int32x4_t vacc2x0123 = vcombine_s32(vsum2x01, vsum2x23 );
+    const int32x2_t vpsum2x4 = vadd_s32(vget_low_s32(vacc2x4), vget_high_s32(vacc2x4));
+    const int32x2_t vpsum2x5 = vadd_s32(vget_low_s32(vacc2x5), vget_high_s32(vacc2x5));
+    const int32x2_t vpsum2x6 = vadd_s32(vget_low_s32(vacc2x6), vget_high_s32(vacc2x6));
+    const int32x2_t vpsum2x7 = vadd_s32(vget_low_s32(vacc2x7), vget_high_s32(vacc2x7));
+    const int32x2_t vsum2x45 = vpadd_s32(vpsum2x4, vpsum2x5);
+    const int32x2_t vsum2x67 = vpadd_s32(vpsum2x6, vpsum2x7);
+    int32x4_t vacc2x4567 = vcombine_s32(vsum2x45, vsum2x67 );
+    const int32x2_t vpsum2x8 = vadd_s32(vget_low_s32(vacc2x8), vget_high_s32(vacc2x8));
+    const int32x2_t vpsum2x9 = vadd_s32(vget_low_s32(vacc2x9), vget_high_s32(vacc2x9));
+    const int32x2_t vpsum2xA = vadd_s32(vget_low_s32(vacc2x10), vget_high_s32(vacc2x10));
+    const int32x2_t vpsum2xB = vadd_s32(vget_low_s32(vacc2x11), vget_high_s32(vacc2x11));
+    const int32x2_t vsum2x89 = vpadd_s32(vpsum2x8, vpsum2x9);
+    const int32x2_t vsum2xAB = vpadd_s32(vpsum2xA, vpsum2xB);
+    int32x4_t vacc2x89AB = vcombine_s32(vsum2x89, vsum2xAB );
+    const int32x2_t vpsum2xC = vadd_s32(vget_low_s32(vacc2x12), vget_high_s32(vacc2x12));
+    const int32x2_t vpsum2xD = vadd_s32(vget_low_s32(vacc2x13), vget_high_s32(vacc2x13));
+    const int32x2_t vpsum2xE = vadd_s32(vget_low_s32(vacc2x14), vget_high_s32(vacc2x14));
+    const int32x2_t vpsum2xF = vadd_s32(vget_low_s32(vacc2x15), vget_high_s32(vacc2x15));
+    const int32x2_t vsum2xCD = vpadd_s32(vpsum2xC, vpsum2xD);
+    const int32x2_t vsum2xEF = vpadd_s32(vpsum2xE, vpsum2xF);
+    int32x4_t vacc2xCDEF = vcombine_s32(vsum2xCD, vsum2xEF );
+    const int32x2_t vpsum3x0 = vadd_s32(vget_low_s32(vacc3x0), vget_high_s32(vacc3x0));
+    const int32x2_t vpsum3x1 = vadd_s32(vget_low_s32(vacc3x1), vget_high_s32(vacc3x1));
+    const int32x2_t vpsum3x2 = vadd_s32(vget_low_s32(vacc3x2), vget_high_s32(vacc3x2));
+    const int32x2_t vpsum3x3 = vadd_s32(vget_low_s32(vacc3x3), vget_high_s32(vacc3x3));
+    const int32x2_t vsum3x01 = vpadd_s32(vpsum3x0, vpsum3x1);
+    const int32x2_t vsum3x23 = vpadd_s32(vpsum3x2, vpsum3x3);
+    int32x4_t vacc3x0123 = vcombine_s32(vsum3x01, vsum3x23 );
+    const int32x2_t vpsum3x4 = vadd_s32(vget_low_s32(vacc3x4), vget_high_s32(vacc3x4));
+    const int32x2_t vpsum3x5 = vadd_s32(vget_low_s32(vacc3x5), vget_high_s32(vacc3x5));
+    const int32x2_t vpsum3x6 = vadd_s32(vget_low_s32(vacc3x6), vget_high_s32(vacc3x6));
+    const int32x2_t vpsum3x7 = vadd_s32(vget_low_s32(vacc3x7), vget_high_s32(vacc3x7));
+    const int32x2_t vsum3x45 = vpadd_s32(vpsum3x4, vpsum3x5);
+    const int32x2_t vsum3x67 = vpadd_s32(vpsum3x6, vpsum3x7);
+    int32x4_t vacc3x4567 = vcombine_s32(vsum3x45, vsum3x67 );
+    const int32x2_t vpsum3x8 = vadd_s32(vget_low_s32(vacc3x8), vget_high_s32(vacc3x8));
+    const int32x2_t vpsum3x9 = vadd_s32(vget_low_s32(vacc3x9), vget_high_s32(vacc3x9));
+    const int32x2_t vpsum3xA = vadd_s32(vget_low_s32(vacc3x10), vget_high_s32(vacc3x10));
+    const int32x2_t vpsum3xB = vadd_s32(vget_low_s32(vacc3x11), vget_high_s32(vacc3x11));
+    const int32x2_t vsum3x89 = vpadd_s32(vpsum3x8, vpsum3x9);
+    const int32x2_t vsum3xAB = vpadd_s32(vpsum3xA, vpsum3xB);
+    int32x4_t vacc3x89AB = vcombine_s32(vsum3x89, vsum3xAB );
+    const int32x2_t vpsum3xC = vadd_s32(vget_low_s32(vacc3x12), vget_high_s32(vacc3x12));
+    const int32x2_t vpsum3xD = vadd_s32(vget_low_s32(vacc3x13), vget_high_s32(vacc3x13));
+    const int32x2_t vpsum3xE = vadd_s32(vget_low_s32(vacc3x14), vget_high_s32(vacc3x14));
+    const int32x2_t vpsum3xF = vadd_s32(vget_low_s32(vacc3x15), vget_high_s32(vacc3x15));
+    const int32x2_t vsum3xCD = vpadd_s32(vpsum3xC, vpsum3xD);
+    const int32x2_t vsum3xEF = vpadd_s32(vpsum3xE, vpsum3xF);
+    int32x4_t vacc3xCDEF = vcombine_s32(vsum3xCD, vsum3xEF );
+#endif
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+    vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc1x89AB = vqrdmulhq_s32(vacc1x89AB, vmultiplier);
+    vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier);
+    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+    vacc2x89AB = vqrdmulhq_s32(vacc2x89AB, vmultiplier);
+    vacc2xCDEF = vqrdmulhq_s32(vacc2xCDEF, vmultiplier);
+    vacc3x0123 = vqrdmulhq_s32(vacc3x0123, vmultiplier);
+    vacc3x4567 = vqrdmulhq_s32(vacc3x4567, vmultiplier);
+    vacc3x89AB = vqrdmulhq_s32(vacc3x89AB, vmultiplier);
+    vacc3xCDEF = vqrdmulhq_s32(vacc3xCDEF, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+    vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc1x89AB = vsraq_n_s32(vacc1x89AB, vbicq_s32(vacc1x89AB, vzero_shift_mask), 31);
+    vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31);
+    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+    vacc2x89AB = vsraq_n_s32(vacc2x89AB, vbicq_s32(vacc2x89AB, vzero_shift_mask), 31);
+    vacc2xCDEF = vsraq_n_s32(vacc2xCDEF, vbicq_s32(vacc2xCDEF, vzero_shift_mask), 31);
+    vacc3x0123 = vsraq_n_s32(vacc3x0123, vbicq_s32(vacc3x0123, vzero_shift_mask), 31);
+    vacc3x4567 = vsraq_n_s32(vacc3x4567, vbicq_s32(vacc3x4567, vzero_shift_mask), 31);
+    vacc3x89AB = vsraq_n_s32(vacc3x89AB, vbicq_s32(vacc3x89AB, vzero_shift_mask), 31);
+    vacc3xCDEF = vsraq_n_s32(vacc3xCDEF, vbicq_s32(vacc3xCDEF, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc1x89AB = vrshlq_s32(vacc1x89AB, vright_shift);
+    vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_shift);
+    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+    vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_shift);
+    vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_shift);
+    vacc3x0123 = vrshlq_s32(vacc3x0123, vright_shift);
+    vacc3x4567 = vrshlq_s32(vacc3x4567, vright_shift);
+    vacc3x89AB = vrshlq_s32(vacc3x89AB, vright_shift);
+    vacc3xCDEF = vrshlq_s32(vacc3xCDEF, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+    const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), voutput_zero_point);
+    const int16x8_t vacc3x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567), voutput_zero_point);
+    const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), voutput_zero_point);
+    int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+    int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
+    int8x16_t vout2x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc2x89ABCDEF);
+    int8x16_t vout3x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc3x01234567), vacc3x89ABCDEF);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+    const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)), voutput_zero_point);
+    const int16x8_t vacc3x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x0123), vqmovn_s32(vacc3x4567)), voutput_zero_point);
+    const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3xCDEF)), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+    int8x16_t vout1x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc1x01234567), vqmovn_s16(vacc1x89ABCDEF));
+    int8x16_t vout2x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc2x89ABCDEF));
+    int8x16_t vout3x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc3x01234567), vqmovn_s16(vacc3x89ABCDEF));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout3x0123456789ABCDEF = vmaxq_s8(vout3x0123456789ABCDEF, voutput_min);
+    vout2x0123456789ABCDEF = vmaxq_s8(vout2x0123456789ABCDEF, voutput_min);
+    vout1x0123456789ABCDEF = vmaxq_s8(vout1x0123456789ABCDEF, voutput_min);
+    vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+
+    vout3x0123456789ABCDEF = vminq_s8(vout3x0123456789ABCDEF, voutput_max);
+    vout2x0123456789ABCDEF = vminq_s8(vout2x0123456789ABCDEF, voutput_max);
+    vout1x0123456789ABCDEF = vminq_s8(vout1x0123456789ABCDEF, voutput_max);
+    vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+
+    if (nc >= 16) {
+      vst1q_s8(c3 + 0, vout3x0123456789ABCDEF);
+      vst1q_s8(c2 + 0, vout2x0123456789ABCDEF);
+      vst1q_s8(c1 + 0, vout1x0123456789ABCDEF);
+      vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+
+      c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 16;
+    } else {
+      int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8(vout1x0123456789ABCDEF));
+      int8x16_t vout2x01234567_3x01234567 = vcombine_s8(vget_low_s8(vout2x0123456789ABCDEF), vget_low_s8(vout3x0123456789ABCDEF));
+      if (nc & 8) {
+        vst1_s8(c3, vget_high_s8(vout2x01234567_3x01234567)); c3 += 8;
+        vst1_s8(c2, vget_low_s8(vout2x01234567_3x01234567)); c2 += 8;
+        vst1_s8(c1, vget_high_s8(vout0x01234567_1x01234567)); c1 += 8;
+        vst1_s8(c0, vget_low_s8(vout0x01234567_1x01234567)); c0 += 8;
+        vout2x01234567_3x01234567 = vcombine_s8(vget_high_s8(vout2x0123456789ABCDEF), vget_high_s8(vout3x0123456789ABCDEF));
+        vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0123456789ABCDEF));
+      }
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c3, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 2); c3 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 0); c2 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c3, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 4); c3 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 0); c2 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c3, vout2x01234567_3x01234567, 8);
+        vst1q_lane_s8(c2, vout2x01234567_3x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
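
The block above finishes the kernel with XNNPACK's fixed-point requantization: vqrdmulhq_s32 applies the Q31 multiplier, the vsraq_n_s32/vbicq_s32 pair pre-adjusts negative values (skipped when the shift is zero, which is what vzero_shift_mask encodes), vrshlq_s32 performs the rounding right shift, and the result is offset by the output zero point and clamped to [output_min, output_max]. A minimal scalar sketch of that scheme for one accumulator, assuming a non-negative right-shift count and ignoring the single saturating corner case of vqrdmulh (acc == multiplier == INT32_MIN):

#include <stdint.h>

// Illustrative scalar equivalent of the NEON requantization above (sketch only).
// The kernel keeps the shift in negated form for vrshlq_s32; here `shift` is the
// positive right-shift count. Identifier names are hypothetical.
static inline int8_t qs8_requantize_sketch(
    int32_t acc, int32_t multiplier, uint32_t shift,
    int16_t zero_point, int8_t qmin, int8_t qmax)
{
  // vqrdmulhq_s32: doubling, rounding multiply returning the high half.
  const int64_t product = (int64_t) acc * (int64_t) multiplier;
  int64_t q = (product + (INT64_C(1) << 30)) >> 31;
  if (shift != 0) {
    // vsraq_n_s32 fix-up: bias negative values down by one so that the rounding
    // shift below rounds ties away from zero, matching the vector code.
    q += q >> 63;
    // vrshlq_s32 with a negative shift amount: rounding arithmetic right shift.
    q = (q + (INT64_C(1) << (shift - 1))) >> shift;
  }
  // Add the output zero point and clamp to the quantized output range.
  int32_t out = (int32_t) q + (int32_t) zero_point;
  if (out < (int32_t) qmin) out = (int32_t) qmin;
  if (out > (int32_t) qmax) out = (int32_t) qmax;
  return (int8_t) out;
}
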
diff --git a/src/qs8-igemm/gen/4x16c8-minmax-neon-mull-padal.c b/src/qs8-igemm/gen/4x16c8-minmax-neon-mull-padal.c
new file mode 100644
index 0000000..ee1a86d
--- /dev/null
+++ b/src/qs8-igemm/gen/4x16c8-minmax-neon-mull-padal.c
@@ -0,0 +1,615 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/c8-neon-mull-padal.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (4 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+  int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    c3 = c2;
+  }
+
+  do {
+    int32x4_t vacc0x0 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x1 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x2 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x3 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x4 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x5 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x6 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x7 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x8 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x9 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x10 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x11 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x12 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x13 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x14 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x15 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc1x0 = vacc0x0;
+    int32x4_t vacc1x1 = vacc0x1;
+    int32x4_t vacc1x2 = vacc0x2;
+    int32x4_t vacc1x3 = vacc0x3;
+    int32x4_t vacc1x4 = vacc0x4;
+    int32x4_t vacc1x5 = vacc0x5;
+    int32x4_t vacc1x6 = vacc0x6;
+    int32x4_t vacc1x7 = vacc0x7;
+    int32x4_t vacc1x8 = vacc0x8;
+    int32x4_t vacc1x9 = vacc0x9;
+    int32x4_t vacc1x10 = vacc0x10;
+    int32x4_t vacc1x11 = vacc0x11;
+    int32x4_t vacc1x12 = vacc0x12;
+    int32x4_t vacc1x13 = vacc0x13;
+    int32x4_t vacc1x14 = vacc0x14;
+    int32x4_t vacc1x15 = vacc0x15;
+    int32x4_t vacc2x0 = vacc0x0;
+    int32x4_t vacc2x1 = vacc0x1;
+    int32x4_t vacc2x2 = vacc0x2;
+    int32x4_t vacc2x3 = vacc0x3;
+    int32x4_t vacc2x4 = vacc0x4;
+    int32x4_t vacc2x5 = vacc0x5;
+    int32x4_t vacc2x6 = vacc0x6;
+    int32x4_t vacc2x7 = vacc0x7;
+    int32x4_t vacc2x8 = vacc0x8;
+    int32x4_t vacc2x9 = vacc0x9;
+    int32x4_t vacc2x10 = vacc0x10;
+    int32x4_t vacc2x11 = vacc0x11;
+    int32x4_t vacc2x12 = vacc0x12;
+    int32x4_t vacc2x13 = vacc0x13;
+    int32x4_t vacc2x14 = vacc0x14;
+    int32x4_t vacc2x15 = vacc0x15;
+    int32x4_t vacc3x0 = vacc0x0;
+    int32x4_t vacc3x1 = vacc0x1;
+    int32x4_t vacc3x2 = vacc0x2;
+    int32x4_t vacc3x3 = vacc0x3;
+    int32x4_t vacc3x4 = vacc0x4;
+    int32x4_t vacc3x5 = vacc0x5;
+    int32x4_t vacc3x6 = vacc0x6;
+    int32x4_t vacc3x7 = vacc0x7;
+    int32x4_t vacc3x8 = vacc0x8;
+    int32x4_t vacc3x9 = vacc0x9;
+    int32x4_t vacc3x10 = vacc0x10;
+    int32x4_t vacc3x11 = vacc0x11;
+    int32x4_t vacc3x12 = vacc0x12;
+    int32x4_t vacc3x13 = vacc0x13;
+    int32x4_t vacc3x14 = vacc0x14;
+    int32x4_t vacc3x15 = vacc0x15;
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      const int8_t* restrict a2 = a[2];
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+      }
+      const int8_t* restrict a3 = a[3];
+      if XNN_UNPREDICTABLE(a3 != zero) {
+        a3 = (const int8_t*) ((uintptr_t) a3 + a_offset);
+      }
+      a += 4;
+
+      size_t k = kc;
+
+      // Handle 8 bytes at a time using MULL and PADAL.
+      while (k > 0) {
+        const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+        const int8x8_t va2 = vld1_s8(a2); a2 += 8;
+        const int8x8_t va3 = vld1_s8(a3); a3 += 8;
+
+        const int8x8_t vb0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x0 = vmull_s8(vb0, va0);
+        const int16x8_t vprod1x0 = vmull_s8(vb0, va1);
+        const int16x8_t vprod2x0 = vmull_s8(vb0, va2);
+        const int16x8_t vprod3x0 = vmull_s8(vb0, va3);
+        vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+        vacc1x0 = vpadalq_s16(vacc1x0, vprod1x0);
+        vacc2x0 = vpadalq_s16(vacc2x0, vprod2x0);
+        vacc3x0 = vpadalq_s16(vacc3x0, vprod3x0);
+        const int8x8_t vb1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x1 = vmull_s8(vb1, va0);
+        const int16x8_t vprod1x1 = vmull_s8(vb1, va1);
+        const int16x8_t vprod2x1 = vmull_s8(vb1, va2);
+        const int16x8_t vprod3x1 = vmull_s8(vb1, va3);
+        vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+        vacc1x1 = vpadalq_s16(vacc1x1, vprod1x1);
+        vacc2x1 = vpadalq_s16(vacc2x1, vprod2x1);
+        vacc3x1 = vpadalq_s16(vacc3x1, vprod3x1);
+        const int8x8_t vb2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x2 = vmull_s8(vb2, va0);
+        const int16x8_t vprod1x2 = vmull_s8(vb2, va1);
+        const int16x8_t vprod2x2 = vmull_s8(vb2, va2);
+        const int16x8_t vprod3x2 = vmull_s8(vb2, va3);
+        vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+        vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);
+        vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
+        vacc3x2 = vpadalq_s16(vacc3x2, vprod3x2);
+        const int8x8_t vb3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x3 = vmull_s8(vb3, va0);
+        const int16x8_t vprod1x3 = vmull_s8(vb3, va1);
+        const int16x8_t vprod2x3 = vmull_s8(vb3, va2);
+        const int16x8_t vprod3x3 = vmull_s8(vb3, va3);
+        vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+        vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);
+        vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
+        vacc3x3 = vpadalq_s16(vacc3x3, vprod3x3);
+        const int8x8_t vb4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x4 = vmull_s8(vb4, va0);
+        const int16x8_t vprod1x4 = vmull_s8(vb4, va1);
+        const int16x8_t vprod2x4 = vmull_s8(vb4, va2);
+        const int16x8_t vprod3x4 = vmull_s8(vb4, va3);
+        vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+        vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);
+        vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);
+        vacc3x4 = vpadalq_s16(vacc3x4, vprod3x4);
+        const int8x8_t vb5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x5 = vmull_s8(vb5, va0);
+        const int16x8_t vprod1x5 = vmull_s8(vb5, va1);
+        const int16x8_t vprod2x5 = vmull_s8(vb5, va2);
+        const int16x8_t vprod3x5 = vmull_s8(vb5, va3);
+        vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+        vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);
+        vacc2x5 = vpadalq_s16(vacc2x5, vprod2x5);
+        vacc3x5 = vpadalq_s16(vacc3x5, vprod3x5);
+        const int8x8_t vb6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x6 = vmull_s8(vb6, va0);
+        const int16x8_t vprod1x6 = vmull_s8(vb6, va1);
+        const int16x8_t vprod2x6 = vmull_s8(vb6, va2);
+        const int16x8_t vprod3x6 = vmull_s8(vb6, va3);
+        vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+        vacc1x6 = vpadalq_s16(vacc1x6, vprod1x6);
+        vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6);
+        vacc3x6 = vpadalq_s16(vacc3x6, vprod3x6);
+        const int8x8_t vb7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x7 = vmull_s8(vb7, va0);
+        const int16x8_t vprod1x7 = vmull_s8(vb7, va1);
+        const int16x8_t vprod2x7 = vmull_s8(vb7, va2);
+        const int16x8_t vprod3x7 = vmull_s8(vb7, va3);
+        vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+        vacc1x7 = vpadalq_s16(vacc1x7, vprod1x7);
+        vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7);
+        vacc3x7 = vpadalq_s16(vacc3x7, vprod3x7);
+        const int8x8_t vb8 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x8 = vmull_s8(vb8, va0);
+        const int16x8_t vprod1x8 = vmull_s8(vb8, va1);
+        const int16x8_t vprod2x8 = vmull_s8(vb8, va2);
+        const int16x8_t vprod3x8 = vmull_s8(vb8, va3);
+        vacc0x8 = vpadalq_s16(vacc0x8, vprod0x8);
+        vacc1x8 = vpadalq_s16(vacc1x8, vprod1x8);
+        vacc2x8 = vpadalq_s16(vacc2x8, vprod2x8);
+        vacc3x8 = vpadalq_s16(vacc3x8, vprod3x8);
+        const int8x8_t vb9 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x9 = vmull_s8(vb9, va0);
+        const int16x8_t vprod1x9 = vmull_s8(vb9, va1);
+        const int16x8_t vprod2x9 = vmull_s8(vb9, va2);
+        const int16x8_t vprod3x9 = vmull_s8(vb9, va3);
+        vacc0x9 = vpadalq_s16(vacc0x9, vprod0x9);
+        vacc1x9 = vpadalq_s16(vacc1x9, vprod1x9);
+        vacc2x9 = vpadalq_s16(vacc2x9, vprod2x9);
+        vacc3x9 = vpadalq_s16(vacc3x9, vprod3x9);
+        const int8x8_t vb10 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x10 = vmull_s8(vb10, va0);
+        const int16x8_t vprod1x10 = vmull_s8(vb10, va1);
+        const int16x8_t vprod2x10 = vmull_s8(vb10, va2);
+        const int16x8_t vprod3x10 = vmull_s8(vb10, va3);
+        vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
+        vacc1x10 = vpadalq_s16(vacc1x10, vprod1x10);
+        vacc2x10 = vpadalq_s16(vacc2x10, vprod2x10);
+        vacc3x10 = vpadalq_s16(vacc3x10, vprod3x10);
+        const int8x8_t vb11 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x11 = vmull_s8(vb11, va0);
+        const int16x8_t vprod1x11 = vmull_s8(vb11, va1);
+        const int16x8_t vprod2x11 = vmull_s8(vb11, va2);
+        const int16x8_t vprod3x11 = vmull_s8(vb11, va3);
+        vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11);
+        vacc1x11 = vpadalq_s16(vacc1x11, vprod1x11);
+        vacc2x11 = vpadalq_s16(vacc2x11, vprod2x11);
+        vacc3x11 = vpadalq_s16(vacc3x11, vprod3x11);
+        const int8x8_t vb12 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x12 = vmull_s8(vb12, va0);
+        const int16x8_t vprod1x12 = vmull_s8(vb12, va1);
+        const int16x8_t vprod2x12 = vmull_s8(vb12, va2);
+        const int16x8_t vprod3x12 = vmull_s8(vb12, va3);
+        vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);
+        vacc1x12 = vpadalq_s16(vacc1x12, vprod1x12);
+        vacc2x12 = vpadalq_s16(vacc2x12, vprod2x12);
+        vacc3x12 = vpadalq_s16(vacc3x12, vprod3x12);
+        const int8x8_t vb13 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x13 = vmull_s8(vb13, va0);
+        const int16x8_t vprod1x13 = vmull_s8(vb13, va1);
+        const int16x8_t vprod2x13 = vmull_s8(vb13, va2);
+        const int16x8_t vprod3x13 = vmull_s8(vb13, va3);
+        vacc0x13 = vpadalq_s16(vacc0x13, vprod0x13);
+        vacc1x13 = vpadalq_s16(vacc1x13, vprod1x13);
+        vacc2x13 = vpadalq_s16(vacc2x13, vprod2x13);
+        vacc3x13 = vpadalq_s16(vacc3x13, vprod3x13);
+        const int8x8_t vb14 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x14 = vmull_s8(vb14, va0);
+        const int16x8_t vprod1x14 = vmull_s8(vb14, va1);
+        const int16x8_t vprod2x14 = vmull_s8(vb14, va2);
+        const int16x8_t vprod3x14 = vmull_s8(vb14, va3);
+        vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
+        vacc1x14 = vpadalq_s16(vacc1x14, vprod1x14);
+        vacc2x14 = vpadalq_s16(vacc2x14, vprod2x14);
+        vacc3x14 = vpadalq_s16(vacc3x14, vprod3x14);
+        const int8x8_t vb15 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x15 = vmull_s8(vb15, va0);
+        const int16x8_t vprod1x15 = vmull_s8(vb15, va1);
+        const int16x8_t vprod2x15 = vmull_s8(vb15, va2);
+        const int16x8_t vprod3x15 = vmull_s8(vb15, va3);
+        vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
+        vacc1x15 = vpadalq_s16(vacc1x15, vprod1x15);
+        vacc2x15 = vpadalq_s16(vacc2x15, vprod2x15);
+        vacc3x15 = vpadalq_s16(vacc3x15, vprod3x15);
+
+        k -= 8 * sizeof(int8_t);
+      }
+
+      p -= 4 * sizeof(void*);
+    } while (p != 0);
+
+#if XNN_ARCH_ARM64
+    const int32x4_t vsum0x01 = vpaddq_s32(vacc0x0, vacc0x1);
+    const int32x4_t vsum0x23 = vpaddq_s32(vacc0x2, vacc0x3);
+    const int32x4_t vsum0x45 = vpaddq_s32(vacc0x4, vacc0x5);
+    const int32x4_t vsum0x67 = vpaddq_s32(vacc0x6, vacc0x7);
+    const int32x4_t vsum0x89 = vpaddq_s32(vacc0x8, vacc0x9);
+    const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11);
+    const int32x4_t vsum0xCD = vpaddq_s32(vacc0x12, vacc0x13);
+    const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
+    const int32x4_t vsum1x01 = vpaddq_s32(vacc1x0, vacc1x1);
+    const int32x4_t vsum1x23 = vpaddq_s32(vacc1x2, vacc1x3);
+    const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);
+    const int32x4_t vsum1x67 = vpaddq_s32(vacc1x6, vacc1x7);
+    const int32x4_t vsum1x89 = vpaddq_s32(vacc1x8, vacc1x9);
+    const int32x4_t vsum1xAB = vpaddq_s32(vacc1x10, vacc1x11);
+    const int32x4_t vsum1xCD = vpaddq_s32(vacc1x12, vacc1x13);
+    const int32x4_t vsum1xEF = vpaddq_s32(vacc1x14, vacc1x15);
+    const int32x4_t vsum2x01 = vpaddq_s32(vacc2x0, vacc2x1);
+    const int32x4_t vsum2x23 = vpaddq_s32(vacc2x2, vacc2x3);
+    const int32x4_t vsum2x45 = vpaddq_s32(vacc2x4, vacc2x5);
+    const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7);
+    const int32x4_t vsum2x89 = vpaddq_s32(vacc2x8, vacc2x9);
+    const int32x4_t vsum2xAB = vpaddq_s32(vacc2x10, vacc2x11);
+    const int32x4_t vsum2xCD = vpaddq_s32(vacc2x12, vacc2x13);
+    const int32x4_t vsum2xEF = vpaddq_s32(vacc2x14, vacc2x15);
+    const int32x4_t vsum3x01 = vpaddq_s32(vacc3x0, vacc3x1);
+    const int32x4_t vsum3x23 = vpaddq_s32(vacc3x2, vacc3x3);
+    const int32x4_t vsum3x45 = vpaddq_s32(vacc3x4, vacc3x5);
+    const int32x4_t vsum3x67 = vpaddq_s32(vacc3x6, vacc3x7);
+    const int32x4_t vsum3x89 = vpaddq_s32(vacc3x8, vacc3x9);
+    const int32x4_t vsum3xAB = vpaddq_s32(vacc3x10, vacc3x11);
+    const int32x4_t vsum3xCD = vpaddq_s32(vacc3x12, vacc3x13);
+    const int32x4_t vsum3xEF = vpaddq_s32(vacc3x14, vacc3x15);
+    int32x4_t vacc0x0123 = vpaddq_s32(vsum0x01, vsum0x23);
+    int32x4_t vacc0x4567 = vpaddq_s32(vsum0x45, vsum0x67);
+    int32x4_t vacc0x89AB = vpaddq_s32(vsum0x89, vsum0xAB);
+    int32x4_t vacc0xCDEF = vpaddq_s32(vsum0xCD, vsum0xEF);
+    int32x4_t vacc1x0123 = vpaddq_s32(vsum1x01, vsum1x23);
+    int32x4_t vacc1x4567 = vpaddq_s32(vsum1x45, vsum1x67);
+    int32x4_t vacc1x89AB = vpaddq_s32(vsum1x89, vsum1xAB);
+    int32x4_t vacc1xCDEF = vpaddq_s32(vsum1xCD, vsum1xEF);
+    int32x4_t vacc2x0123 = vpaddq_s32(vsum2x01, vsum2x23);
+    int32x4_t vacc2x4567 = vpaddq_s32(vsum2x45, vsum2x67);
+    int32x4_t vacc2x89AB = vpaddq_s32(vsum2x89, vsum2xAB);
+    int32x4_t vacc2xCDEF = vpaddq_s32(vsum2xCD, vsum2xEF);
+    int32x4_t vacc3x0123 = vpaddq_s32(vsum3x01, vsum3x23);
+    int32x4_t vacc3x4567 = vpaddq_s32(vsum3x45, vsum3x67);
+    int32x4_t vacc3x89AB = vpaddq_s32(vsum3x89, vsum3xAB);
+    int32x4_t vacc3xCDEF = vpaddq_s32(vsum3xCD, vsum3xEF);
+#else
+    const int32x2_t vpsum0x0 = vadd_s32(vget_low_s32(vacc0x0), vget_high_s32(vacc0x0));
+    const int32x2_t vpsum0x1 = vadd_s32(vget_low_s32(vacc0x1), vget_high_s32(vacc0x1));
+    const int32x2_t vpsum0x2 = vadd_s32(vget_low_s32(vacc0x2), vget_high_s32(vacc0x2));
+    const int32x2_t vpsum0x3 = vadd_s32(vget_low_s32(vacc0x3), vget_high_s32(vacc0x3));
+    const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1);
+    const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3);
+    int32x4_t vacc0x0123 = vcombine_s32(vsum0x01, vsum0x23 );
+    const int32x2_t vpsum0x4 = vadd_s32(vget_low_s32(vacc0x4), vget_high_s32(vacc0x4));
+    const int32x2_t vpsum0x5 = vadd_s32(vget_low_s32(vacc0x5), vget_high_s32(vacc0x5));
+    const int32x2_t vpsum0x6 = vadd_s32(vget_low_s32(vacc0x6), vget_high_s32(vacc0x6));
+    const int32x2_t vpsum0x7 = vadd_s32(vget_low_s32(vacc0x7), vget_high_s32(vacc0x7));
+    const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5);
+    const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7);
+    int32x4_t vacc0x4567 = vcombine_s32(vsum0x45, vsum0x67 );
+    const int32x2_t vpsum0x8 = vadd_s32(vget_low_s32(vacc0x8), vget_high_s32(vacc0x8));
+    const int32x2_t vpsum0x9 = vadd_s32(vget_low_s32(vacc0x9), vget_high_s32(vacc0x9));
+    const int32x2_t vpsum0xA = vadd_s32(vget_low_s32(vacc0x10), vget_high_s32(vacc0x10));
+    const int32x2_t vpsum0xB = vadd_s32(vget_low_s32(vacc0x11), vget_high_s32(vacc0x11));
+    const int32x2_t vsum0x89 = vpadd_s32(vpsum0x8, vpsum0x9);
+    const int32x2_t vsum0xAB = vpadd_s32(vpsum0xA, vpsum0xB);
+    int32x4_t vacc0x89AB = vcombine_s32(vsum0x89, vsum0xAB );
+    const int32x2_t vpsum0xC = vadd_s32(vget_low_s32(vacc0x12), vget_high_s32(vacc0x12));
+    const int32x2_t vpsum0xD = vadd_s32(vget_low_s32(vacc0x13), vget_high_s32(vacc0x13));
+    const int32x2_t vpsum0xE = vadd_s32(vget_low_s32(vacc0x14), vget_high_s32(vacc0x14));
+    const int32x2_t vpsum0xF = vadd_s32(vget_low_s32(vacc0x15), vget_high_s32(vacc0x15));
+    const int32x2_t vsum0xCD = vpadd_s32(vpsum0xC, vpsum0xD);
+    const int32x2_t vsum0xEF = vpadd_s32(vpsum0xE, vpsum0xF);
+    int32x4_t vacc0xCDEF = vcombine_s32(vsum0xCD, vsum0xEF );
+    const int32x2_t vpsum1x0 = vadd_s32(vget_low_s32(vacc1x0), vget_high_s32(vacc1x0));
+    const int32x2_t vpsum1x1 = vadd_s32(vget_low_s32(vacc1x1), vget_high_s32(vacc1x1));
+    const int32x2_t vpsum1x2 = vadd_s32(vget_low_s32(vacc1x2), vget_high_s32(vacc1x2));
+    const int32x2_t vpsum1x3 = vadd_s32(vget_low_s32(vacc1x3), vget_high_s32(vacc1x3));
+    const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1);
+    const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3);
+    int32x4_t vacc1x0123 = vcombine_s32(vsum1x01, vsum1x23 );
+    const int32x2_t vpsum1x4 = vadd_s32(vget_low_s32(vacc1x4), vget_high_s32(vacc1x4));
+    const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));
+    const int32x2_t vpsum1x6 = vadd_s32(vget_low_s32(vacc1x6), vget_high_s32(vacc1x6));
+    const int32x2_t vpsum1x7 = vadd_s32(vget_low_s32(vacc1x7), vget_high_s32(vacc1x7));
+    const int32x2_t vsum1x45 = vpadd_s32(vpsum1x4, vpsum1x5);
+    const int32x2_t vsum1x67 = vpadd_s32(vpsum1x6, vpsum1x7);
+    int32x4_t vacc1x4567 = vcombine_s32(vsum1x45, vsum1x67 );
+    const int32x2_t vpsum1x8 = vadd_s32(vget_low_s32(vacc1x8), vget_high_s32(vacc1x8));
+    const int32x2_t vpsum1x9 = vadd_s32(vget_low_s32(vacc1x9), vget_high_s32(vacc1x9));
+    const int32x2_t vpsum1xA = vadd_s32(vget_low_s32(vacc1x10), vget_high_s32(vacc1x10));
+    const int32x2_t vpsum1xB = vadd_s32(vget_low_s32(vacc1x11), vget_high_s32(vacc1x11));
+    const int32x2_t vsum1x89 = vpadd_s32(vpsum1x8, vpsum1x9);
+    const int32x2_t vsum1xAB = vpadd_s32(vpsum1xA, vpsum1xB);
+    int32x4_t vacc1x89AB = vcombine_s32(vsum1x89, vsum1xAB );
+    const int32x2_t vpsum1xC = vadd_s32(vget_low_s32(vacc1x12), vget_high_s32(vacc1x12));
+    const int32x2_t vpsum1xD = vadd_s32(vget_low_s32(vacc1x13), vget_high_s32(vacc1x13));
+    const int32x2_t vpsum1xE = vadd_s32(vget_low_s32(vacc1x14), vget_high_s32(vacc1x14));
+    const int32x2_t vpsum1xF = vadd_s32(vget_low_s32(vacc1x15), vget_high_s32(vacc1x15));
+    const int32x2_t vsum1xCD = vpadd_s32(vpsum1xC, vpsum1xD);
+    const int32x2_t vsum1xEF = vpadd_s32(vpsum1xE, vpsum1xF);
+    int32x4_t vacc1xCDEF = vcombine_s32(vsum1xCD, vsum1xEF );
+    const int32x2_t vpsum2x0 = vadd_s32(vget_low_s32(vacc2x0), vget_high_s32(vacc2x0));
+    const int32x2_t vpsum2x1 = vadd_s32(vget_low_s32(vacc2x1), vget_high_s32(vacc2x1));
+    const int32x2_t vpsum2x2 = vadd_s32(vget_low_s32(vacc2x2), vget_high_s32(vacc2x2));
+    const int32x2_t vpsum2x3 = vadd_s32(vget_low_s32(vacc2x3), vget_high_s32(vacc2x3));
+    const int32x2_t vsum2x01 = vpadd_s32(vpsum2x0, vpsum2x1);
+    const int32x2_t vsum2x23 = vpadd_s32(vpsum2x2, vpsum2x3);
+    int32x4_t vacc2x0123 = vcombine_s32(vsum2x01, vsum2x23 );
+    const int32x2_t vpsum2x4 = vadd_s32(vget_low_s32(vacc2x4), vget_high_s32(vacc2x4));
+    const int32x2_t vpsum2x5 = vadd_s32(vget_low_s32(vacc2x5), vget_high_s32(vacc2x5));
+    const int32x2_t vpsum2x6 = vadd_s32(vget_low_s32(vacc2x6), vget_high_s32(vacc2x6));
+    const int32x2_t vpsum2x7 = vadd_s32(vget_low_s32(vacc2x7), vget_high_s32(vacc2x7));
+    const int32x2_t vsum2x45 = vpadd_s32(vpsum2x4, vpsum2x5);
+    const int32x2_t vsum2x67 = vpadd_s32(vpsum2x6, vpsum2x7);
+    int32x4_t vacc2x4567 = vcombine_s32(vsum2x45, vsum2x67 );
+    const int32x2_t vpsum2x8 = vadd_s32(vget_low_s32(vacc2x8), vget_high_s32(vacc2x8));
+    const int32x2_t vpsum2x9 = vadd_s32(vget_low_s32(vacc2x9), vget_high_s32(vacc2x9));
+    const int32x2_t vpsum2xA = vadd_s32(vget_low_s32(vacc2x10), vget_high_s32(vacc2x10));
+    const int32x2_t vpsum2xB = vadd_s32(vget_low_s32(vacc2x11), vget_high_s32(vacc2x11));
+    const int32x2_t vsum2x89 = vpadd_s32(vpsum2x8, vpsum2x9);
+    const int32x2_t vsum2xAB = vpadd_s32(vpsum2xA, vpsum2xB);
+    int32x4_t vacc2x89AB = vcombine_s32(vsum2x89, vsum2xAB );
+    const int32x2_t vpsum2xC = vadd_s32(vget_low_s32(vacc2x12), vget_high_s32(vacc2x12));
+    const int32x2_t vpsum2xD = vadd_s32(vget_low_s32(vacc2x13), vget_high_s32(vacc2x13));
+    const int32x2_t vpsum2xE = vadd_s32(vget_low_s32(vacc2x14), vget_high_s32(vacc2x14));
+    const int32x2_t vpsum2xF = vadd_s32(vget_low_s32(vacc2x15), vget_high_s32(vacc2x15));
+    const int32x2_t vsum2xCD = vpadd_s32(vpsum2xC, vpsum2xD);
+    const int32x2_t vsum2xEF = vpadd_s32(vpsum2xE, vpsum2xF);
+    int32x4_t vacc2xCDEF = vcombine_s32(vsum2xCD, vsum2xEF );
+    const int32x2_t vpsum3x0 = vadd_s32(vget_low_s32(vacc3x0), vget_high_s32(vacc3x0));
+    const int32x2_t vpsum3x1 = vadd_s32(vget_low_s32(vacc3x1), vget_high_s32(vacc3x1));
+    const int32x2_t vpsum3x2 = vadd_s32(vget_low_s32(vacc3x2), vget_high_s32(vacc3x2));
+    const int32x2_t vpsum3x3 = vadd_s32(vget_low_s32(vacc3x3), vget_high_s32(vacc3x3));
+    const int32x2_t vsum3x01 = vpadd_s32(vpsum3x0, vpsum3x1);
+    const int32x2_t vsum3x23 = vpadd_s32(vpsum3x2, vpsum3x3);
+    int32x4_t vacc3x0123 = vcombine_s32(vsum3x01, vsum3x23 );
+    const int32x2_t vpsum3x4 = vadd_s32(vget_low_s32(vacc3x4), vget_high_s32(vacc3x4));
+    const int32x2_t vpsum3x5 = vadd_s32(vget_low_s32(vacc3x5), vget_high_s32(vacc3x5));
+    const int32x2_t vpsum3x6 = vadd_s32(vget_low_s32(vacc3x6), vget_high_s32(vacc3x6));
+    const int32x2_t vpsum3x7 = vadd_s32(vget_low_s32(vacc3x7), vget_high_s32(vacc3x7));
+    const int32x2_t vsum3x45 = vpadd_s32(vpsum3x4, vpsum3x5);
+    const int32x2_t vsum3x67 = vpadd_s32(vpsum3x6, vpsum3x7);
+    int32x4_t vacc3x4567 = vcombine_s32(vsum3x45, vsum3x67 );
+    const int32x2_t vpsum3x8 = vadd_s32(vget_low_s32(vacc3x8), vget_high_s32(vacc3x8));
+    const int32x2_t vpsum3x9 = vadd_s32(vget_low_s32(vacc3x9), vget_high_s32(vacc3x9));
+    const int32x2_t vpsum3xA = vadd_s32(vget_low_s32(vacc3x10), vget_high_s32(vacc3x10));
+    const int32x2_t vpsum3xB = vadd_s32(vget_low_s32(vacc3x11), vget_high_s32(vacc3x11));
+    const int32x2_t vsum3x89 = vpadd_s32(vpsum3x8, vpsum3x9);
+    const int32x2_t vsum3xAB = vpadd_s32(vpsum3xA, vpsum3xB);
+    int32x4_t vacc3x89AB = vcombine_s32(vsum3x89, vsum3xAB );
+    const int32x2_t vpsum3xC = vadd_s32(vget_low_s32(vacc3x12), vget_high_s32(vacc3x12));
+    const int32x2_t vpsum3xD = vadd_s32(vget_low_s32(vacc3x13), vget_high_s32(vacc3x13));
+    const int32x2_t vpsum3xE = vadd_s32(vget_low_s32(vacc3x14), vget_high_s32(vacc3x14));
+    const int32x2_t vpsum3xF = vadd_s32(vget_low_s32(vacc3x15), vget_high_s32(vacc3x15));
+    const int32x2_t vsum3xCD = vpadd_s32(vpsum3xC, vpsum3xD);
+    const int32x2_t vsum3xEF = vpadd_s32(vpsum3xE, vpsum3xF);
+    int32x4_t vacc3xCDEF = vcombine_s32(vsum3xCD, vsum3xEF );
+#endif
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+    vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc1x89AB = vqrdmulhq_s32(vacc1x89AB, vmultiplier);
+    vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier);
+    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+    vacc2x89AB = vqrdmulhq_s32(vacc2x89AB, vmultiplier);
+    vacc2xCDEF = vqrdmulhq_s32(vacc2xCDEF, vmultiplier);
+    vacc3x0123 = vqrdmulhq_s32(vacc3x0123, vmultiplier);
+    vacc3x4567 = vqrdmulhq_s32(vacc3x4567, vmultiplier);
+    vacc3x89AB = vqrdmulhq_s32(vacc3x89AB, vmultiplier);
+    vacc3xCDEF = vqrdmulhq_s32(vacc3xCDEF, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+    vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc1x89AB = vsraq_n_s32(vacc1x89AB, vbicq_s32(vacc1x89AB, vzero_shift_mask), 31);
+    vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31);
+    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+    vacc2x89AB = vsraq_n_s32(vacc2x89AB, vbicq_s32(vacc2x89AB, vzero_shift_mask), 31);
+    vacc2xCDEF = vsraq_n_s32(vacc2xCDEF, vbicq_s32(vacc2xCDEF, vzero_shift_mask), 31);
+    vacc3x0123 = vsraq_n_s32(vacc3x0123, vbicq_s32(vacc3x0123, vzero_shift_mask), 31);
+    vacc3x4567 = vsraq_n_s32(vacc3x4567, vbicq_s32(vacc3x4567, vzero_shift_mask), 31);
+    vacc3x89AB = vsraq_n_s32(vacc3x89AB, vbicq_s32(vacc3x89AB, vzero_shift_mask), 31);
+    vacc3xCDEF = vsraq_n_s32(vacc3xCDEF, vbicq_s32(vacc3xCDEF, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc1x89AB = vrshlq_s32(vacc1x89AB, vright_shift);
+    vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_shift);
+    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+    vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_shift);
+    vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_shift);
+    vacc3x0123 = vrshlq_s32(vacc3x0123, vright_shift);
+    vacc3x4567 = vrshlq_s32(vacc3x4567, vright_shift);
+    vacc3x89AB = vrshlq_s32(vacc3x89AB, vright_shift);
+    vacc3xCDEF = vrshlq_s32(vacc3xCDEF, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+    const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), voutput_zero_point);
+    const int16x8_t vacc3x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567), voutput_zero_point);
+    const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), voutput_zero_point);
+    int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+    int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
+    int8x16_t vout2x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc2x89ABCDEF);
+    int8x16_t vout3x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc3x01234567), vacc3x89ABCDEF);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+    const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)), voutput_zero_point);
+    const int16x8_t vacc3x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x0123), vqmovn_s32(vacc3x4567)), voutput_zero_point);
+    const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3xCDEF)), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+    int8x16_t vout1x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc1x01234567), vqmovn_s16(vacc1x89ABCDEF));
+    int8x16_t vout2x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc2x89ABCDEF));
+    int8x16_t vout3x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc3x01234567), vqmovn_s16(vacc3x89ABCDEF));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout3x0123456789ABCDEF = vmaxq_s8(vout3x0123456789ABCDEF, voutput_min);
+    vout2x0123456789ABCDEF = vmaxq_s8(vout2x0123456789ABCDEF, voutput_min);
+    vout1x0123456789ABCDEF = vmaxq_s8(vout1x0123456789ABCDEF, voutput_min);
+    vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+
+    vout3x0123456789ABCDEF = vminq_s8(vout3x0123456789ABCDEF, voutput_max);
+    vout2x0123456789ABCDEF = vminq_s8(vout2x0123456789ABCDEF, voutput_max);
+    vout1x0123456789ABCDEF = vminq_s8(vout1x0123456789ABCDEF, voutput_max);
+    vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+
+    if (nc >= 16) {
+      vst1q_s8(c3 + 0, vout3x0123456789ABCDEF);
+      vst1q_s8(c2 + 0, vout2x0123456789ABCDEF);
+      vst1q_s8(c1 + 0, vout1x0123456789ABCDEF);
+      vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+
+      c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 16;
+    } else {
+      int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8(vout1x0123456789ABCDEF));
+      int8x16_t vout2x01234567_3x01234567 = vcombine_s8(vget_low_s8(vout2x0123456789ABCDEF), vget_low_s8(vout3x0123456789ABCDEF));
+      if (nc & 8) {
+        vst1_s8(c3, vget_high_s8(vout2x01234567_3x01234567)); c3 += 8;
+        vst1_s8(c2, vget_low_s8(vout2x01234567_3x01234567)); c2 += 8;
+        vst1_s8(c1, vget_high_s8(vout0x01234567_1x01234567)); c1 += 8;
+        vst1_s8(c0, vget_low_s8(vout0x01234567_1x01234567)); c0 += 8;
+        vout2x01234567_3x01234567 = vcombine_s8(vget_high_s8(vout2x0123456789ABCDEF), vget_high_s8(vout3x0123456789ABCDEF));
+        vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0123456789ABCDEF));
+      }
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c3, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 2); c3 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 0); c2 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c3, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 4); c3 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 0); c2 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c3, vout2x01234567_3x01234567, 8);
+        vst1q_lane_s8(c2, vout2x01234567_3x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
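
In this kernel each vaccMxN vector holds four int32 partial sums for output column N of row M, built from vmull_s8 products widened through vpadalq_s16; the pairwise adds under #if XNN_ARCH_ARM64 (or the vpadd_s32 chains on ARMv7) then fold them into the vaccMx0123..vaccMxCDEF vectors that feed requantization. Per output element this amounts to an int8 dot product over kc (rounded up to a multiple of 8) plus the bias read from the packed weights. A scalar sketch under that simplification — the real kernel reads bias and weights from the packed buffer w, not from a plain column pointer:

#include <stddef.h>
#include <stdint.h>

// Reference semantics (sketch only) for one output element of the c8 kernels.
// Assumes a_row and b_col are zero-padded out to the rounded-up kc.
static int32_t qs8_c8_dot_sketch(const int8_t* a_row, const int8_t* b_col,
                                 size_t kc, int32_t bias)
{
  int32_t acc = bias;
  for (size_t k = 0; k < kc; k++) {
    acc += (int32_t) a_row[k] * (int32_t) b_col[k];
  }
  return acc;
}
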
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-sse2-ld128.c b/src/qs8-igemm/gen/4x4c2-minmax-sse2-ld128.c
index 89c7807..012a594 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-sse2-ld128.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-sse2-ld128.c
@@ -12,6 +12,7 @@
 #include <emmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_4x4c2__sse2_ld128(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
@@ -197,21 +199,6 @@
               _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
             vacc3x0123 = _mm_add_epi32(vacc3x0123,
               _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-            if (k > 6 * sizeof(int8_t)) {
-              const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-              w = (const void*) ((uintptr_t) w + 8);
-              const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
-
-              vacc0x0123 = _mm_add_epi32(vacc0x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-              vacc1x0123 = _mm_add_epi32(vacc1x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-              vacc2x0123 = _mm_add_epi32(vacc2x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-              vacc3x0123 = _mm_add_epi32(vacc3x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            }
           }
         }
       }
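
This hunk and the matching ones in the remaining 4x4c2 kernels below pair the new kc = round_up_po2(kc, 2) with removal of the old `k > 6 * sizeof(int8_t)` tail: once kc is even, the remainder left after the 8-byte main loop is at most 6 bytes, so the branch that loaded vxb3 could never execute. round_up_po2 comes from the newly included <xnnpack/math.h>; a minimal sketch of such a helper, in case the definition is unfamiliar (the actual implementation may differ in detail):

#include <stddef.h>

// Rounds n up to the next multiple of q, where q must be a power of two.
// E.g. round_up_po2(7, 2) == 8 and round_up_po2(6, 2) == 6.
static inline size_t round_up_po2(size_t n, size_t q) {
  return (n + q - 1) & ~(q - 1);
}
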
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-sse2-ld64.c b/src/qs8-igemm/gen/4x4c2-minmax-sse2-ld64.c
index 7b5ce73..3d83c87 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-sse2-ld64.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-sse2-ld64.c
@@ -12,6 +12,7 @@
 #include <emmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_4x4c2__sse2_ld64(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
@@ -197,21 +199,6 @@
               _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
             vacc3x0123 = _mm_add_epi32(vacc3x0123,
               _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-            if (k > 6 * sizeof(int8_t)) {
-              const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-              w = (const void*) ((uintptr_t) w + 8);
-              const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
-
-              vacc0x0123 = _mm_add_epi32(vacc0x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-              vacc1x0123 = _mm_add_epi32(vacc1x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-              vacc2x0123 = _mm_add_epi32(vacc2x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-              vacc3x0123 = _mm_add_epi32(vacc3x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            }
           }
         }
       }
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-sse41-ld128.c b/src/qs8-igemm/gen/4x4c2-minmax-sse41-ld128.c
index 6fc97cc..d0a7c2a 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-sse41-ld128.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-sse41-ld128.c
@@ -12,6 +12,7 @@
 #include <smmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
@@ -197,21 +199,6 @@
               _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
             vacc3x0123 = _mm_add_epi32(vacc3x0123,
               _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-            if (k > 6 * sizeof(int8_t)) {
-              const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-              w = (const void*) ((uintptr_t) w + 8);
-              const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
-
-              vacc0x0123 = _mm_add_epi32(vacc0x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-              vacc1x0123 = _mm_add_epi32(vacc1x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-              vacc2x0123 = _mm_add_epi32(vacc2x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-              vacc3x0123 = _mm_add_epi32(vacc3x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            }
           }
         }
       }
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-sse41-ld64.c b/src/qs8-igemm/gen/4x4c2-minmax-sse41-ld64.c
index 4978b7b..9c34aa3 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-sse41-ld64.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-sse41-ld64.c
@@ -12,6 +12,7 @@
 #include <smmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
@@ -197,21 +199,6 @@
               _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
             vacc3x0123 = _mm_add_epi32(vacc3x0123,
               _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-            if (k > 6 * sizeof(int8_t)) {
-              const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-              w = (const void*) ((uintptr_t) w + 8);
-              const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
-
-              vacc0x0123 = _mm_add_epi32(vacc0x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-              vacc1x0123 = _mm_add_epi32(vacc1x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-              vacc2x0123 = _mm_add_epi32(vacc2x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-              vacc3x0123 = _mm_add_epi32(vacc3x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            }
           }
         }
       }
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-ssse3-ld128.c b/src/qs8-igemm/gen/4x4c2-minmax-ssse3-ld128.c
index 82cfd1f..9c1304f 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-ssse3-ld128.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-ssse3-ld128.c
@@ -12,6 +12,7 @@
 #include <tmmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_4x4c2__ssse3_ld128(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
@@ -197,21 +199,6 @@
               _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
             vacc3x0123 = _mm_add_epi32(vacc3x0123,
               _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-            if (k > 6 * sizeof(int8_t)) {
-              const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-              w = (const void*) ((uintptr_t) w + 8);
-              const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
-
-              vacc0x0123 = _mm_add_epi32(vacc0x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-              vacc1x0123 = _mm_add_epi32(vacc1x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-              vacc2x0123 = _mm_add_epi32(vacc2x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-              vacc3x0123 = _mm_add_epi32(vacc3x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            }
           }
         }
       }
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-ssse3-ld64.c b/src/qs8-igemm/gen/4x4c2-minmax-ssse3-ld64.c
index 8a2def1..2d56cdf 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-ssse3-ld64.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-ssse3-ld64.c
@@ -12,6 +12,7 @@
 #include <tmmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_4x4c2__ssse3_ld64(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
@@ -197,21 +199,6 @@
               _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
             vacc3x0123 = _mm_add_epi32(vacc3x0123,
               _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-            if (k > 6 * sizeof(int8_t)) {
-              const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-              w = (const void*) ((uintptr_t) w + 8);
-              const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
-
-              vacc0x0123 = _mm_add_epi32(vacc0x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-              vacc1x0123 = _mm_add_epi32(vacc1x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-              vacc2x0123 = _mm_add_epi32(vacc2x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-              vacc3x0123 = _mm_add_epi32(vacc3x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            }
           }
         }
       }
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-xop-ld128.c b/src/qs8-igemm/gen/4x4c2-minmax-xop-ld128.c
index b1b6880..e6faba2 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-xop-ld128.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-xop-ld128.c
@@ -17,6 +17,7 @@
 #endif
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128(
@@ -44,6 +45,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
@@ -202,21 +204,6 @@
               _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc2x0123);
             vacc3x0123 = _mm_maddd_epi16(
               _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc3x0123);
-
-            if (k > 6 * sizeof(int8_t)) {
-              const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-              w = (const void*) ((uintptr_t) w + 8);
-              const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
-
-              vacc0x0123 = _mm_maddd_epi16(
-                _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc0x0123);
-              vacc1x0123 = _mm_maddd_epi16(
-                _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc1x0123);
-              vacc2x0123 = _mm_maddd_epi16(
-                _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc2x0123);
-              vacc3x0123 = _mm_maddd_epi16(
-                _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc3x0123);
-            }
           }
         }
       }
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-xop-ld64.c b/src/qs8-igemm/gen/4x4c2-minmax-xop-ld64.c
index 7f3c778..0253dfa 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-xop-ld64.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-xop-ld64.c
@@ -17,6 +17,7 @@
 #endif
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64(
@@ -44,6 +45,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
@@ -202,21 +204,6 @@
               _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc2x0123);
             vacc3x0123 = _mm_maddd_epi16(
               _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc3x0123);
-
-            if (k > 6 * sizeof(int8_t)) {
-              const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-              w = (const void*) ((uintptr_t) w + 8);
-              const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
-
-              vacc0x0123 = _mm_maddd_epi16(
-                _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc0x0123);
-              vacc1x0123 = _mm_maddd_epi16(
-                _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc1x0123);
-              vacc2x0123 = _mm_maddd_epi16(
-                _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc2x0123);
-              vacc3x0123 = _mm_maddd_epi16(
-                _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc3x0123);
-            }
           }
         }
       }
diff --git a/src/qs8-igemm/gen/4x8-minmax-neon-mlal-lane.c b/src/qs8-igemm/gen/4x8-minmax-neon-mlal-lane.c
new file mode 100644
index 0000000..2f2ba67
--- /dev/null
+++ b/src/qs8-igemm/gen/4x8-minmax-neon-mlal-lane.c
@@ -0,0 +1,397 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/neon-mlal-lane.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/igemm.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_4x8__neon_mlal_lane(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (4 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+  int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    c3 = c2;
+  }
+
+  do {
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc1x0123 = vacc0x0123;
+    int32x4_t vacc1x4567 = vacc0x4567;
+    int32x4_t vacc2x0123 = vacc0x0123;
+    int32x4_t vacc2x4567 = vacc0x4567;
+    int32x4_t vacc3x0123 = vacc0x0123;
+    int32x4_t vacc3x4567 = vacc0x4567;
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      const int8_t* restrict a2 = a[2];
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+      }
+      const int8_t* restrict a3 = a[3];
+      if XNN_UNPREDICTABLE(a3 != zero) {
+        a3 = (const int8_t*) ((uintptr_t) a3 + a_offset);
+      }
+      a += 4;
+
+      size_t k = kc;
+      while (k >= 8 * sizeof(int8_t)) {
+        const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+        const int16x8_t vxa0 = vmovl_s8(va0);
+        const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+        const int16x8_t vxa1 = vmovl_s8(va1);
+        const int8x8_t va2 = vld1_s8(a2); a2 += 8;
+        const int16x8_t vxa2 = vmovl_s8(va2);
+        const int8x8_t va3 = vld1_s8(a3); a3 += 8;
+        const int16x8_t vxa3 = vmovl_s8(va3);
+
+        const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb01234567c0 = vmovl_s8(vb01234567c0);
+
+        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+        vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
+        vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
+        vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
+        vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
+        vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0);
+        vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0);
+        const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb01234567c1 = vmovl_s8(vb01234567c1);
+
+        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+        vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
+        vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
+        vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa2), 1);
+        vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa2), 1);
+        vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1);
+        vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1);
+        const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb01234567c2 = vmovl_s8(vb01234567c2);
+
+        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+        vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
+        vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
+        vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa2), 2);
+        vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa2), 2);
+        vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2);
+        vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa3), 2);
+        const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb01234567c3 = vmovl_s8(vb01234567c3);
+
+        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+        vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
+        vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
+        vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa2), 3);
+        vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa2), 3);
+        vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa3), 3);
+        vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa3), 3);
+
+        const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb01234567c4 = vmovl_s8(vb01234567c4);
+
+        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+        vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
+        vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
+        vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa2), 0);
+        vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa2), 0);
+        vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa3), 0);
+        vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa3), 0);
+        const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb01234567c5 = vmovl_s8(vb01234567c5);
+
+        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+        vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
+        vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
+        vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa2), 1);
+        vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa2), 1);
+        vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa3), 1);
+        vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa3), 1);
+        const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb01234567c6 = vmovl_s8(vb01234567c6);
+
+        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+        vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
+        vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
+        vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa2), 2);
+        vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa2), 2);
+        vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa3), 2);
+        vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa3), 2);
+        const int8x8_t vb01234567c7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb01234567c7 = vmovl_s8(vb01234567c7);
+
+        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa0), 3);
+        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa0), 3);
+        vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa1), 3);
+        vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa1), 3);
+        vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa2), 3);
+        vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa2), 3);
+        vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa3), 3);
+        vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa3), 3);
+
+        k -= 8 * sizeof(int8_t);
+      }
+      if XNN_UNLIKELY(k != 0) {
+        const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+        const int16x8_t vxa0 = vmovl_s8(va0);
+        const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
+        const int16x8_t vxa1 = vmovl_s8(va1);
+        const int8x8_t va2 = vld1_s8(a2); a2 = (const int8_t*) ((uintptr_t) a2 + k);
+        const int16x8_t vxa2 = vmovl_s8(va2);
+        const int8x8_t va3 = vld1_s8(a3); a3 = (const int8_t*) ((uintptr_t) a3 + k);
+        const int16x8_t vxa3 = vmovl_s8(va3);
+
+        const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vxb01234567c0 = vmovl_s8(vb01234567c0);
+
+        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+        vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
+        vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
+        vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
+        vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
+        vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0);
+        vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0);
+
+        if (k >= 2 * sizeof(int8_t)) {
+          const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int16x8_t vxb01234567c1 = vmovl_s8(vb01234567c1);
+
+          vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+          vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+          vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
+          vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
+          vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa2), 1);
+          vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa2), 1);
+          vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1);
+          vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1);
+
+          if (k > 2 * sizeof(int8_t)) {
+            const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int16x8_t vxb01234567c2 = vmovl_s8(vb01234567c2);
+
+            vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+            vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+            vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
+            vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
+            vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa2), 2);
+            vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa2), 2);
+            vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2);
+            vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa3), 2);
+
+            if (k >= 4 * sizeof(int8_t)) {
+              const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+              const int16x8_t vxb01234567c3 = vmovl_s8(vb01234567c3);
+
+              vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+              vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+              vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
+              vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
+              vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa2), 3);
+              vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa2), 3);
+              vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa3), 3);
+              vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa3), 3);
+
+              if (k > 4 * sizeof(int8_t)) {
+                const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+                const int16x8_t vxb01234567c4 = vmovl_s8(vb01234567c4);
+
+                vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+                vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+                vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
+                vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
+                vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa2), 0);
+                vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa2), 0);
+                vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa3), 0);
+                vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa3), 0);
+
+                if (k >= 6 * sizeof(int8_t)) {
+                  const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+                  const int16x8_t vxb01234567c5 = vmovl_s8(vb01234567c5);
+
+                  vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+                  vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+                  vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
+                  vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
+                  vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa2), 1);
+                  vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa2), 1);
+                  vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa3), 1);
+                  vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa3), 1);
+
+                  if (k > 6 * sizeof(int8_t)) {
+                    const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+                    const int16x8_t vxb01234567c6 = vmovl_s8(vb01234567c6);
+
+                    vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+                    vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+                    vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
+                    vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
+                    vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa2), 2);
+                    vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa2), 2);
+                    vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa3), 2);
+                    vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa3), 2);
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+      p -= 4 * sizeof(void*);
+    } while (p != 0);
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+    vacc3x0123 = vqrdmulhq_s32(vacc3x0123, vmultiplier);
+    vacc3x4567 = vqrdmulhq_s32(vacc3x4567, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+    vacc3x0123 = vsraq_n_s32(vacc3x0123, vbicq_s32(vacc3x0123, vzero_shift_mask), 31);
+    vacc3x4567 = vsraq_n_s32(vacc3x4567, vbicq_s32(vacc3x4567, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+    vacc3x0123 = vrshlq_s32(vacc3x0123, vright_shift);
+    vacc3x4567 = vrshlq_s32(vacc3x4567, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+    const int16x8_t vacc3x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
+    int8x16_t vout2x01234567_3x01234567 = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc3x01234567);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+    const int16x8_t vacc3x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x0123), vqmovn_s32(vacc3x4567)), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
+    int8x16_t vout2x01234567_3x01234567 = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc3x01234567));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout2x01234567_3x01234567 = vmaxq_s8(vout2x01234567_3x01234567, voutput_min);
+    vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min);
+
+    vout2x01234567_3x01234567 = vminq_s8(vout2x01234567_3x01234567, voutput_max);
+    vout0x01234567_1x01234567 = vminq_s8(vout0x01234567_1x01234567, voutput_max);
+
+    if (nc >= 8) {
+      vst1_s8(c3 + 0, vget_high_s8(vout2x01234567_3x01234567));
+      vst1_s8(c2 + 0, vget_low_s8(vout2x01234567_3x01234567));
+      vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567));
+      vst1_s8(c0 + 0, vget_low_s8(vout0x01234567_1x01234567));
+
+      c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c3, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 2); c3 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 0); c2 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c3, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 4); c3 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 0); c2 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c3, vout2x01234567_3x01234567, 8);
+        vst1q_lane_s8(c2, vout2x01234567_3x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/4x8-minmax-neon-mull-addw-dup.c b/src/qs8-igemm/gen/4x8-minmax-neon-mull-addw-dup.c
new file mode 100644
index 0000000..abe1d12
--- /dev/null
+++ b/src/qs8-igemm/gen/4x8-minmax-neon-mull-addw-dup.c
@@ -0,0 +1,433 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/neon-mull-addw-dup.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/gemm.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_4x8__neon_mull_addw_dup(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (4 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+  int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    c3 = c2;
+  }
+
+  do {
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc1x0123 = vacc0x0123;
+    int32x4_t vacc1x4567 = vacc0x4567;
+    int32x4_t vacc2x0123 = vacc0x0123;
+    int32x4_t vacc2x4567 = vacc0x4567;
+    int32x4_t vacc3x0123 = vacc0x0123;
+    int32x4_t vacc3x4567 = vacc0x4567;
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      const int8_t* restrict a2 = a[2];
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+      }
+      const int8_t* restrict a3 = a[3];
+      if XNN_UNPREDICTABLE(a3 != zero) {
+        a3 = (const int8_t*) ((uintptr_t) a3 + a_offset);
+      }
+      a += 4;
+
+      size_t k = kc;
+      while (k >= 8 * sizeof(int8_t)) {
+        const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+        const int8x8_t va2 = vld1_s8(a2); a2 += 8;
+        const int8x8_t va3 = vld1_s8(a3); a3 += 8;
+
+        const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va0, 0));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c0));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c0));
+        const int16x8_t vprod1x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va1, 0));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c0));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c0));
+        const int16x8_t vprod2x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va2, 0));
+        vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c0));
+        vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c0));
+        const int16x8_t vprod3x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va3, 0));
+        vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c0));
+        vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c0));
+        const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va0, 1));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c1));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c1));
+        const int16x8_t vprod1x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va1, 1));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c1));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c1));
+        const int16x8_t vprod2x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va2, 1));
+        vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c1));
+        vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c1));
+        const int16x8_t vprod3x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va3, 1));
+        vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c1));
+        vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c1));
+        const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va0, 2));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c2));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c2));
+        const int16x8_t vprod1x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va1, 2));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c2));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c2));
+        const int16x8_t vprod2x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va2, 2));
+        vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c2));
+        vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c2));
+        const int16x8_t vprod3x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va3, 2));
+        vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c2));
+        vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c2));
+        const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));
+        const int16x8_t vprod1x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va1, 3));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c3));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c3));
+        const int16x8_t vprod2x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va2, 3));
+        vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c3));
+        vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c3));
+        const int16x8_t vprod3x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va3, 3));
+        vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c3));
+        vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c3));
+        const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va0, 4));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c4));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c4));
+        const int16x8_t vprod1x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va1, 4));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c4));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c4));
+        const int16x8_t vprod2x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va2, 4));
+        vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c4));
+        vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c4));
+        const int16x8_t vprod3x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va3, 4));
+        vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c4));
+        vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c4));
+        const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va0, 5));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c5));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c5));
+        const int16x8_t vprod1x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va1, 5));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c5));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c5));
+        const int16x8_t vprod2x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va2, 5));
+        vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c5));
+        vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c5));
+        const int16x8_t vprod3x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va3, 5));
+        vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c5));
+        vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c5));
+        const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va0, 6));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c6));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c6));
+        const int16x8_t vprod1x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va1, 6));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c6));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c6));
+        const int16x8_t vprod2x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va2, 6));
+        vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c6));
+        vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c6));
+        const int16x8_t vprod3x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va3, 6));
+        vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c6));
+        vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c6));
+        const int8x8_t vb01234567c7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c7 = vmull_s8(vb01234567c7, vdup_lane_s8(va0, 7));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c7));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c7));
+        const int16x8_t vprod1x01234567c7 = vmull_s8(vb01234567c7, vdup_lane_s8(va1, 7));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c7));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c7));
+        const int16x8_t vprod2x01234567c7 = vmull_s8(vb01234567c7, vdup_lane_s8(va2, 7));
+        vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c7));
+        vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c7));
+        const int16x8_t vprod3x01234567c7 = vmull_s8(vb01234567c7, vdup_lane_s8(va3, 7));
+        vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c7));
+        vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c7));
+
+        k -= 8 * sizeof(int8_t);
+      }
+      if XNN_UNLIKELY(k != 0) {
+        const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+        const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
+        const int8x8_t va2 = vld1_s8(a2); a2 = (const int8_t*) ((uintptr_t) a2 + k);
+        const int8x8_t va3 = vld1_s8(a3); a3 = (const int8_t*) ((uintptr_t) a3 + k);
+
+        const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va0, 0));
+        vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c0));
+        vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c0));
+        const int16x8_t vprod1x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va1, 0));
+        vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c0));
+        vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c0));
+        const int16x8_t vprod2x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va2, 0));
+        vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c0));
+        vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c0));
+        const int16x8_t vprod3x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va3, 0));
+        vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c0));
+        vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c0));
+
+        if (k >= 2 * sizeof(int8_t)) {
+          const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          const int16x8_t vprod0x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va0, 1));
+          vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c1));
+          vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c1));
+          const int16x8_t vprod1x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va1, 1));
+          vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c1));
+          vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c1));
+          const int16x8_t vprod2x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va2, 1));
+          vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c1));
+          vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c1));
+          const int16x8_t vprod3x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va3, 1));
+          vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c1));
+          vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c1));
+
+          if (k > 2 * sizeof(int8_t)) {
+            const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+            const int16x8_t vprod0x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va0, 2));
+            vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c2));
+            vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c2));
+            const int16x8_t vprod1x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va1, 2));
+            vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c2));
+            vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c2));
+            const int16x8_t vprod2x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va2, 2));
+            vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c2));
+            vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c2));
+            const int16x8_t vprod3x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va3, 2));
+            vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c2));
+            vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c2));
+
+            if (k >= 4 * sizeof(int8_t)) {
+              const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+              const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));
+              vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
+              vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));
+              const int16x8_t vprod1x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va1, 3));
+              vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c3));
+              vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c3));
+              const int16x8_t vprod2x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va2, 3));
+              vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c3));
+              vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c3));
+              const int16x8_t vprod3x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va3, 3));
+              vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c3));
+              vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c3));
+
+              if (k > 4 * sizeof(int8_t)) {
+                const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+                const int16x8_t vprod0x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va0, 4));
+                vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c4));
+                vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c4));
+                const int16x8_t vprod1x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va1, 4));
+                vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c4));
+                vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c4));
+                const int16x8_t vprod2x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va2, 4));
+                vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c4));
+                vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c4));
+                const int16x8_t vprod3x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va3, 4));
+                vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c4));
+                vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c4));
+
+                if (k >= 6 * sizeof(int8_t)) {
+                  const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+                  const int16x8_t vprod0x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va0, 5));
+                  vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c5));
+                  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c5));
+                  const int16x8_t vprod1x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va1, 5));
+                  vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c5));
+                  vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c5));
+                  const int16x8_t vprod2x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va2, 5));
+                  vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c5));
+                  vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c5));
+                  const int16x8_t vprod3x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va3, 5));
+                  vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c5));
+                  vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c5));
+
+                  if (k > 6 * sizeof(int8_t)) {
+                    const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+                    const int16x8_t vprod0x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va0, 6));
+                    vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c6));
+                    vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c6));
+                    const int16x8_t vprod1x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va1, 6));
+                    vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c6));
+                    vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c6));
+                    const int16x8_t vprod2x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va2, 6));
+                    vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c6));
+                    vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c6));
+                    const int16x8_t vprod3x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va3, 6));
+                    vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c6));
+                    vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c6));
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+      p -= 4 * sizeof(void*);
+    } while (p != 0);
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+    vacc3x0123 = vqrdmulhq_s32(vacc3x0123, vmultiplier);
+    vacc3x4567 = vqrdmulhq_s32(vacc3x4567, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+    vacc3x0123 = vsraq_n_s32(vacc3x0123, vbicq_s32(vacc3x0123, vzero_shift_mask), 31);
+    vacc3x4567 = vsraq_n_s32(vacc3x4567, vbicq_s32(vacc3x4567, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+    vacc3x0123 = vrshlq_s32(vacc3x0123, vright_shift);
+    vacc3x4567 = vrshlq_s32(vacc3x4567, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+    const int16x8_t vacc3x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
+    int8x16_t vout2x01234567_3x01234567 = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc3x01234567);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+    const int16x8_t vacc3x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x0123), vqmovn_s32(vacc3x4567)), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
+    int8x16_t vout2x01234567_3x01234567 = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc3x01234567));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout2x01234567_3x01234567 = vmaxq_s8(vout2x01234567_3x01234567, voutput_min);
+    vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min);
+
+    vout2x01234567_3x01234567 = vminq_s8(vout2x01234567_3x01234567, voutput_max);
+    vout0x01234567_1x01234567 = vminq_s8(vout0x01234567_1x01234567, voutput_max);
+
+    if (nc >= 8) {
+      vst1_s8(c3 + 0, vget_high_s8(vout2x01234567_3x01234567));
+      vst1_s8(c2 + 0, vget_low_s8(vout2x01234567_3x01234567));
+      vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567));
+      vst1_s8(c0 + 0, vget_low_s8(vout0x01234567_1x01234567));
+
+      c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c3, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 2); c3 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 0); c2 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c3, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 4); c3 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 0); c2 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c3, vout2x01234567_3x01234567, 8);
+        vst1q_lane_s8(c2, vout2x01234567_3x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/4x8c16-minmax-neon-mlal-padal.c b/src/qs8-igemm/gen/4x8c16-minmax-neon-mlal-padal.c
new file mode 100644
index 0000000..ef871fb
--- /dev/null
+++ b/src/qs8-igemm/gen/4x8c16-minmax-neon-mlal-padal.c
@@ -0,0 +1,413 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/c16-neon-mlal-padal.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (4 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 16);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+  int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    c3 = c2;
+  }
+
+  do {
+    int32x4_t vacc0x0 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x1 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x2 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x3 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x4 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x5 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x6 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x7 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc1x0 = vacc0x0;
+    int32x4_t vacc1x1 = vacc0x1;
+    int32x4_t vacc1x2 = vacc0x2;
+    int32x4_t vacc1x3 = vacc0x3;
+    int32x4_t vacc1x4 = vacc0x4;
+    int32x4_t vacc1x5 = vacc0x5;
+    int32x4_t vacc1x6 = vacc0x6;
+    int32x4_t vacc1x7 = vacc0x7;
+    int32x4_t vacc2x0 = vacc0x0;
+    int32x4_t vacc2x1 = vacc0x1;
+    int32x4_t vacc2x2 = vacc0x2;
+    int32x4_t vacc2x3 = vacc0x3;
+    int32x4_t vacc2x4 = vacc0x4;
+    int32x4_t vacc2x5 = vacc0x5;
+    int32x4_t vacc2x6 = vacc0x6;
+    int32x4_t vacc2x7 = vacc0x7;
+    int32x4_t vacc3x0 = vacc0x0;
+    int32x4_t vacc3x1 = vacc0x1;
+    int32x4_t vacc3x2 = vacc0x2;
+    int32x4_t vacc3x3 = vacc0x3;
+    int32x4_t vacc3x4 = vacc0x4;
+    int32x4_t vacc3x5 = vacc0x5;
+    int32x4_t vacc3x6 = vacc0x6;
+    int32x4_t vacc3x7 = vacc0x7;
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      const int8_t* restrict a2 = a[2];
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+      }
+      const int8_t* restrict a3 = a[3];
+      if XNN_UNPREDICTABLE(a3 != zero) {
+        a3 = (const int8_t*) ((uintptr_t) a3 + a_offset);
+      }
+      a += 4;
+
+      // KC loop of 16 (kc is rounded up to a multiple of 16, covering up to 15 remainder bytes)
+      size_t k = 0;
+      while (k < kc) {
+        const int8x16_t va0 = vld1q_s8(a0); a0 += 16;
+        const int8x16_t va1 = vld1q_s8(a1); a1 += 16;
+        const int8x16_t va2 = vld1q_s8(a2); a2 += 16;
+        const int8x16_t va3 = vld1q_s8(a3); a3 += 16;
+
+        const int8x16_t vb0 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb1 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb2 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb3 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb4 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb5 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb6 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+        const int8x16_t vb7 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));
+
+        int16x8_t vprod0x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va0));
+        int16x8_t vprod1x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va1));
+        int16x8_t vprod2x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va2));
+        int16x8_t vprod3x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va3));
+        vprod0x0 = vmlal_s8(vprod0x0, vget_high_s8(vb0), vget_high_s8(va0));
+        vprod1x0 = vmlal_s8(vprod1x0, vget_high_s8(vb0), vget_high_s8(va1));
+        vprod2x0 = vmlal_s8(vprod2x0, vget_high_s8(vb0), vget_high_s8(va2));
+        vprod3x0 = vmlal_s8(vprod3x0, vget_high_s8(vb0), vget_high_s8(va3));
+        vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+        vacc1x0 = vpadalq_s16(vacc1x0, vprod1x0);
+        vacc2x0 = vpadalq_s16(vacc2x0, vprod2x0);
+        vacc3x0 = vpadalq_s16(vacc3x0, vprod3x0);
+        int16x8_t vprod0x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va0));
+        int16x8_t vprod1x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va1));
+        int16x8_t vprod2x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va2));
+        int16x8_t vprod3x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va3));
+        vprod0x1 = vmlal_s8(vprod0x1, vget_high_s8(vb1), vget_high_s8(va0));
+        vprod1x1 = vmlal_s8(vprod1x1, vget_high_s8(vb1), vget_high_s8(va1));
+        vprod2x1 = vmlal_s8(vprod2x1, vget_high_s8(vb1), vget_high_s8(va2));
+        vprod3x1 = vmlal_s8(vprod3x1, vget_high_s8(vb1), vget_high_s8(va3));
+        vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+        vacc1x1 = vpadalq_s16(vacc1x1, vprod1x1);
+        vacc2x1 = vpadalq_s16(vacc2x1, vprod2x1);
+        vacc3x1 = vpadalq_s16(vacc3x1, vprod3x1);
+        int16x8_t vprod0x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va0));
+        int16x8_t vprod1x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va1));
+        int16x8_t vprod2x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va2));
+        int16x8_t vprod3x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va3));
+        vprod0x2 = vmlal_s8(vprod0x2, vget_high_s8(vb2), vget_high_s8(va0));
+        vprod1x2 = vmlal_s8(vprod1x2, vget_high_s8(vb2), vget_high_s8(va1));
+        vprod2x2 = vmlal_s8(vprod2x2, vget_high_s8(vb2), vget_high_s8(va2));
+        vprod3x2 = vmlal_s8(vprod3x2, vget_high_s8(vb2), vget_high_s8(va3));
+        vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+        vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);
+        vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
+        vacc3x2 = vpadalq_s16(vacc3x2, vprod3x2);
+        int16x8_t vprod0x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va0));
+        int16x8_t vprod1x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va1));
+        int16x8_t vprod2x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va2));
+        int16x8_t vprod3x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va3));
+        vprod0x3 = vmlal_s8(vprod0x3, vget_high_s8(vb3), vget_high_s8(va0));
+        vprod1x3 = vmlal_s8(vprod1x3, vget_high_s8(vb3), vget_high_s8(va1));
+        vprod2x3 = vmlal_s8(vprod2x3, vget_high_s8(vb3), vget_high_s8(va2));
+        vprod3x3 = vmlal_s8(vprod3x3, vget_high_s8(vb3), vget_high_s8(va3));
+        vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+        vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);
+        vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
+        vacc3x3 = vpadalq_s16(vacc3x3, vprod3x3);
+        int16x8_t vprod0x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va0));
+        int16x8_t vprod1x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va1));
+        int16x8_t vprod2x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va2));
+        int16x8_t vprod3x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va3));
+        vprod0x4 = vmlal_s8(vprod0x4, vget_high_s8(vb4), vget_high_s8(va0));
+        vprod1x4 = vmlal_s8(vprod1x4, vget_high_s8(vb4), vget_high_s8(va1));
+        vprod2x4 = vmlal_s8(vprod2x4, vget_high_s8(vb4), vget_high_s8(va2));
+        vprod3x4 = vmlal_s8(vprod3x4, vget_high_s8(vb4), vget_high_s8(va3));
+        vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+        vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);
+        vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);
+        vacc3x4 = vpadalq_s16(vacc3x4, vprod3x4);
+        int16x8_t vprod0x5 = vmull_s8(vget_low_s8(vb5), vget_low_s8(va0));
+        int16x8_t vprod1x5 = vmull_s8(vget_low_s8(vb5), vget_low_s8(va1));
+        int16x8_t vprod2x5 = vmull_s8(vget_low_s8(vb5), vget_low_s8(va2));
+        int16x8_t vprod3x5 = vmull_s8(vget_low_s8(vb5), vget_low_s8(va3));
+        vprod0x5 = vmlal_s8(vprod0x5, vget_high_s8(vb5), vget_high_s8(va0));
+        vprod1x5 = vmlal_s8(vprod1x5, vget_high_s8(vb5), vget_high_s8(va1));
+        vprod2x5 = vmlal_s8(vprod2x5, vget_high_s8(vb5), vget_high_s8(va2));
+        vprod3x5 = vmlal_s8(vprod3x5, vget_high_s8(vb5), vget_high_s8(va3));
+        vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+        vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);
+        vacc2x5 = vpadalq_s16(vacc2x5, vprod2x5);
+        vacc3x5 = vpadalq_s16(vacc3x5, vprod3x5);
+        int16x8_t vprod0x6 = vmull_s8(vget_low_s8(vb6), vget_low_s8(va0));
+        int16x8_t vprod1x6 = vmull_s8(vget_low_s8(vb6), vget_low_s8(va1));
+        int16x8_t vprod2x6 = vmull_s8(vget_low_s8(vb6), vget_low_s8(va2));
+        int16x8_t vprod3x6 = vmull_s8(vget_low_s8(vb6), vget_low_s8(va3));
+        vprod0x6 = vmlal_s8(vprod0x6, vget_high_s8(vb6), vget_high_s8(va0));
+        vprod1x6 = vmlal_s8(vprod1x6, vget_high_s8(vb6), vget_high_s8(va1));
+        vprod2x6 = vmlal_s8(vprod2x6, vget_high_s8(vb6), vget_high_s8(va2));
+        vprod3x6 = vmlal_s8(vprod3x6, vget_high_s8(vb6), vget_high_s8(va3));
+        vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+        vacc1x6 = vpadalq_s16(vacc1x6, vprod1x6);
+        vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6);
+        vacc3x6 = vpadalq_s16(vacc3x6, vprod3x6);
+        int16x8_t vprod0x7 = vmull_s8(vget_low_s8(vb7), vget_low_s8(va0));
+        int16x8_t vprod1x7 = vmull_s8(vget_low_s8(vb7), vget_low_s8(va1));
+        int16x8_t vprod2x7 = vmull_s8(vget_low_s8(vb7), vget_low_s8(va2));
+        int16x8_t vprod3x7 = vmull_s8(vget_low_s8(vb7), vget_low_s8(va3));
+        vprod0x7 = vmlal_s8(vprod0x7, vget_high_s8(vb7), vget_high_s8(va0));
+        vprod1x7 = vmlal_s8(vprod1x7, vget_high_s8(vb7), vget_high_s8(va1));
+        vprod2x7 = vmlal_s8(vprod2x7, vget_high_s8(vb7), vget_high_s8(va2));
+        vprod3x7 = vmlal_s8(vprod3x7, vget_high_s8(vb7), vget_high_s8(va3));
+        vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+        vacc1x7 = vpadalq_s16(vacc1x7, vprod1x7);
+        vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7);
+        vacc3x7 = vpadalq_s16(vacc3x7, vprod3x7);
+
+        k += 16 * sizeof(int8_t);
+      }
+
+      p -= 4 * sizeof(void*);
+    } while (p != 0);
+
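+    // Each vaccMxN holds four partial sums for output column N of row M; horizontally reduce and pack
+    // them into vaccMx0123 / vaccMx4567. vpaddq_s32 is only available on AArch64, so the 32-bit ARM
+    // path below performs the same pairwise reduction with vpadd_s32 on 64-bit halves.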
+#if XNN_ARCH_ARM64
+    const int32x4_t vsum0x01 = vpaddq_s32(vacc0x0, vacc0x1);
+    const int32x4_t vsum0x23 = vpaddq_s32(vacc0x2, vacc0x3);
+    const int32x4_t vsum0x45 = vpaddq_s32(vacc0x4, vacc0x5);
+    const int32x4_t vsum0x67 = vpaddq_s32(vacc0x6, vacc0x7);
+    const int32x4_t vsum1x01 = vpaddq_s32(vacc1x0, vacc1x1);
+    const int32x4_t vsum1x23 = vpaddq_s32(vacc1x2, vacc1x3);
+    const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);
+    const int32x4_t vsum1x67 = vpaddq_s32(vacc1x6, vacc1x7);
+    const int32x4_t vsum2x01 = vpaddq_s32(vacc2x0, vacc2x1);
+    const int32x4_t vsum2x23 = vpaddq_s32(vacc2x2, vacc2x3);
+    const int32x4_t vsum2x45 = vpaddq_s32(vacc2x4, vacc2x5);
+    const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7);
+    const int32x4_t vsum3x01 = vpaddq_s32(vacc3x0, vacc3x1);
+    const int32x4_t vsum3x23 = vpaddq_s32(vacc3x2, vacc3x3);
+    const int32x4_t vsum3x45 = vpaddq_s32(vacc3x4, vacc3x5);
+    const int32x4_t vsum3x67 = vpaddq_s32(vacc3x6, vacc3x7);
+    int32x4_t vacc0x0123 = vpaddq_s32(vsum0x01, vsum0x23);
+    int32x4_t vacc0x4567 = vpaddq_s32(vsum0x45, vsum0x67);
+    int32x4_t vacc1x0123 = vpaddq_s32(vsum1x01, vsum1x23);
+    int32x4_t vacc1x4567 = vpaddq_s32(vsum1x45, vsum1x67);
+    int32x4_t vacc2x0123 = vpaddq_s32(vsum2x01, vsum2x23);
+    int32x4_t vacc2x4567 = vpaddq_s32(vsum2x45, vsum2x67);
+    int32x4_t vacc3x0123 = vpaddq_s32(vsum3x01, vsum3x23);
+    int32x4_t vacc3x4567 = vpaddq_s32(vsum3x45, vsum3x67);
+#else
+    const int32x2_t vpsum0x0 = vadd_s32(vget_low_s32(vacc0x0), vget_high_s32(vacc0x0));
+    const int32x2_t vpsum0x1 = vadd_s32(vget_low_s32(vacc0x1), vget_high_s32(vacc0x1));
+    const int32x2_t vpsum0x2 = vadd_s32(vget_low_s32(vacc0x2), vget_high_s32(vacc0x2));
+    const int32x2_t vpsum0x3 = vadd_s32(vget_low_s32(vacc0x3), vget_high_s32(vacc0x3));
+    const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1);
+    const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3);
+    int32x4_t vacc0x0123 = vcombine_s32(vsum0x01, vsum0x23 );
+    const int32x2_t vpsum0x4 = vadd_s32(vget_low_s32(vacc0x4), vget_high_s32(vacc0x4));
+    const int32x2_t vpsum0x5 = vadd_s32(vget_low_s32(vacc0x5), vget_high_s32(vacc0x5));
+    const int32x2_t vpsum0x6 = vadd_s32(vget_low_s32(vacc0x6), vget_high_s32(vacc0x6));
+    const int32x2_t vpsum0x7 = vadd_s32(vget_low_s32(vacc0x7), vget_high_s32(vacc0x7));
+    const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5);
+    const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7);
+    int32x4_t vacc0x4567 = vcombine_s32(vsum0x45, vsum0x67 );
+    const int32x2_t vpsum1x0 = vadd_s32(vget_low_s32(vacc1x0), vget_high_s32(vacc1x0));
+    const int32x2_t vpsum1x1 = vadd_s32(vget_low_s32(vacc1x1), vget_high_s32(vacc1x1));
+    const int32x2_t vpsum1x2 = vadd_s32(vget_low_s32(vacc1x2), vget_high_s32(vacc1x2));
+    const int32x2_t vpsum1x3 = vadd_s32(vget_low_s32(vacc1x3), vget_high_s32(vacc1x3));
+    const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1);
+    const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3);
+    int32x4_t vacc1x0123 = vcombine_s32(vsum1x01, vsum1x23 );
+    const int32x2_t vpsum1x4 = vadd_s32(vget_low_s32(vacc1x4), vget_high_s32(vacc1x4));
+    const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));
+    const int32x2_t vpsum1x6 = vadd_s32(vget_low_s32(vacc1x6), vget_high_s32(vacc1x6));
+    const int32x2_t vpsum1x7 = vadd_s32(vget_low_s32(vacc1x7), vget_high_s32(vacc1x7));
+    const int32x2_t vsum1x45 = vpadd_s32(vpsum1x4, vpsum1x5);
+    const int32x2_t vsum1x67 = vpadd_s32(vpsum1x6, vpsum1x7);
+    int32x4_t vacc1x4567 = vcombine_s32(vsum1x45, vsum1x67 );
+    const int32x2_t vpsum2x0 = vadd_s32(vget_low_s32(vacc2x0), vget_high_s32(vacc2x0));
+    const int32x2_t vpsum2x1 = vadd_s32(vget_low_s32(vacc2x1), vget_high_s32(vacc2x1));
+    const int32x2_t vpsum2x2 = vadd_s32(vget_low_s32(vacc2x2), vget_high_s32(vacc2x2));
+    const int32x2_t vpsum2x3 = vadd_s32(vget_low_s32(vacc2x3), vget_high_s32(vacc2x3));
+    const int32x2_t vsum2x01 = vpadd_s32(vpsum2x0, vpsum2x1);
+    const int32x2_t vsum2x23 = vpadd_s32(vpsum2x2, vpsum2x3);
+    int32x4_t vacc2x0123 = vcombine_s32(vsum2x01, vsum2x23 );
+    const int32x2_t vpsum2x4 = vadd_s32(vget_low_s32(vacc2x4), vget_high_s32(vacc2x4));
+    const int32x2_t vpsum2x5 = vadd_s32(vget_low_s32(vacc2x5), vget_high_s32(vacc2x5));
+    const int32x2_t vpsum2x6 = vadd_s32(vget_low_s32(vacc2x6), vget_high_s32(vacc2x6));
+    const int32x2_t vpsum2x7 = vadd_s32(vget_low_s32(vacc2x7), vget_high_s32(vacc2x7));
+    const int32x2_t vsum2x45 = vpadd_s32(vpsum2x4, vpsum2x5);
+    const int32x2_t vsum2x67 = vpadd_s32(vpsum2x6, vpsum2x7);
+    int32x4_t vacc2x4567 = vcombine_s32(vsum2x45, vsum2x67 );
+    const int32x2_t vpsum3x0 = vadd_s32(vget_low_s32(vacc3x0), vget_high_s32(vacc3x0));
+    const int32x2_t vpsum3x1 = vadd_s32(vget_low_s32(vacc3x1), vget_high_s32(vacc3x1));
+    const int32x2_t vpsum3x2 = vadd_s32(vget_low_s32(vacc3x2), vget_high_s32(vacc3x2));
+    const int32x2_t vpsum3x3 = vadd_s32(vget_low_s32(vacc3x3), vget_high_s32(vacc3x3));
+    const int32x2_t vsum3x01 = vpadd_s32(vpsum3x0, vpsum3x1);
+    const int32x2_t vsum3x23 = vpadd_s32(vpsum3x2, vpsum3x3);
+    int32x4_t vacc3x0123 = vcombine_s32(vsum3x01, vsum3x23 );
+    const int32x2_t vpsum3x4 = vadd_s32(vget_low_s32(vacc3x4), vget_high_s32(vacc3x4));
+    const int32x2_t vpsum3x5 = vadd_s32(vget_low_s32(vacc3x5), vget_high_s32(vacc3x5));
+    const int32x2_t vpsum3x6 = vadd_s32(vget_low_s32(vacc3x6), vget_high_s32(vacc3x6));
+    const int32x2_t vpsum3x7 = vadd_s32(vget_low_s32(vacc3x7), vget_high_s32(vacc3x7));
+    const int32x2_t vsum3x45 = vpadd_s32(vpsum3x4, vpsum3x5);
+    const int32x2_t vsum3x67 = vpadd_s32(vpsum3x6, vpsum3x7);
+    int32x4_t vacc3x4567 = vcombine_s32(vsum3x45, vsum3x67 );
+#endif
+
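+    // Requantize: Q31 fixed-point multiply (vqrdmulhq_s32), a sign-based pre-adjustment of negative
+    // values before the rounding right shift (vrshlq_s32 with a negative shift amount), then add the
+    // output zero point, saturate-narrow to int8, and clamp to [output_min, output_max].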
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+    vacc3x0123 = vqrdmulhq_s32(vacc3x0123, vmultiplier);
+    vacc3x4567 = vqrdmulhq_s32(vacc3x4567, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+    vacc3x0123 = vsraq_n_s32(vacc3x0123, vbicq_s32(vacc3x0123, vzero_shift_mask), 31);
+    vacc3x4567 = vsraq_n_s32(vacc3x4567, vbicq_s32(vacc3x4567, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+    vacc3x0123 = vrshlq_s32(vacc3x0123, vright_shift);
+    vacc3x4567 = vrshlq_s32(vacc3x4567, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+    const int16x8_t vacc3x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567), voutput_zero_point);
+    int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
+    int8x16_t vout2x01234567_3x01234567 = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc3x01234567);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+    const int16x8_t vacc3x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x0123), vqmovn_s32(vacc3x4567)), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
+    int8x16_t vout2x01234567_3x01234567 = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc3x01234567));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout2x01234567_3x01234567 = vmaxq_s8(vout2x01234567_3x01234567, voutput_min);
+    vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min);
+
+    vout2x01234567_3x01234567 = vminq_s8(vout2x01234567_3x01234567, voutput_max);
+    vout0x01234567_1x01234567 = vminq_s8(vout0x01234567_1x01234567, voutput_max);
+
+    if (nc >= 8) {
+      vst1_s8(c3 + 0, vget_high_s8(vout2x01234567_3x01234567));
+      vst1_s8(c2 + 0, vget_low_s8(vout2x01234567_3x01234567));
+      vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567));
+      vst1_s8(c0 + 0, vget_low_s8(vout0x01234567_1x01234567));
+
+      c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c3, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 2); c3 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 0); c2 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c3, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 4); c3 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 0); c2 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c3, vout2x01234567_3x01234567, 8);
+        vst1q_lane_s8(c2, vout2x01234567_3x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/4x8c2-minmax-neon-mlal-padal-dup.c b/src/qs8-igemm/gen/4x8c2-minmax-neon-mlal-padal-dup.c
new file mode 100644
index 0000000..80aa9ac
--- /dev/null
+++ b/src/qs8-igemm/gen/4x8c2-minmax-neon-mlal-padal-dup.c
@@ -0,0 +1,471 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/c2-neon-mull-padal-dup.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (4 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+  int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    c3 = c2;
+  }
+
+  do {
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc1x0123 = vacc0x0123;
+    int32x4_t vacc1x4567 = vacc0x4567;
+    int32x4_t vacc2x0123 = vacc0x0123;
+    int32x4_t vacc2x4567 = vacc0x4567;
+    int32x4_t vacc3x0123 = vacc0x0123;
+    int32x4_t vacc3x4567 = vacc0x4567;
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      const int8_t* restrict a2 = a[2];
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+      }
+      const int8_t* restrict a3 = a[3];
+      if XNN_UNPREDICTABLE(a3 != zero) {
+        a3 = (const int8_t*) ((uintptr_t) a3 + a_offset);
+      }
+      a += 4;
+
+      size_t k = kc;
+
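+      // Main K loop: 16 K bytes per row per iteration; for each 2-byte K group (c0..c3) the first
+      // 8 bytes use vmull_s8 and the second 8 use vmlal_s8, then the 16-bit products are widened
+      // into the int32 accumulators with vpadalq_s16.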
+      while (k >= 16 * sizeof(int8_t)) {
+        const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va0x1 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va1x0 = vld1_s8(a1); a1 += 8;
+        const int8x8_t va1x1 = vld1_s8(a1); a1 += 8;
+        const int8x8_t va2x0 = vld1_s8(a2); a2 += 8;
+        const int8x8_t va2x1 = vld1_s8(a2); a2 += 8;
+        const int8x8_t va3x0 = vld1_s8(a3); a3 += 8;
+        const int8x8_t va3x1 = vld1_s8(a3); a3 += 8;
+
+        const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+        int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0)));
+        int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
+        int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 0)));
+        const int8x8_t vb0123c0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x0123c0 = vmlal_s8(vprod0x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+        vprod1x0123c0 = vmlal_s8(vprod1x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
+        vprod2x0123c0 = vmlal_s8(vprod2x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 0)));
+        vprod3x0123c0 = vmlal_s8(vprod3x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 0)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+        vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c0);
+        int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+        int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0)));
+        int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
+        int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 0)));
+        const int8x8_t vb4567c0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x4567c0 = vmlal_s8(vprod0x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+        vprod1x4567c0 = vmlal_s8(vprod1x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
+        vprod2x4567c0 = vmlal_s8(vprod2x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 0)));
+        vprod3x4567c0 = vmlal_s8(vprod3x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 0)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+        vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c0);
+        int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+        int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 1)));
+        int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
+        int16x8_t vprod3x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 1)));
+        const int8x8_t vb0123c1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x0123c1 = vmlal_s8(vprod0x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+        vprod1x0123c1 = vmlal_s8(vprod1x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
+        vprod2x0123c1 = vmlal_s8(vprod2x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 1)));
+        vprod3x0123c1 = vmlal_s8(vprod3x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 1)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+        vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c1);
+        int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+        int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 1)));
+        int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
+        int16x8_t vprod3x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 1)));
+        const int8x8_t vb4567c1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x4567c1 = vmlal_s8(vprod0x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+        vprod1x4567c1 = vmlal_s8(vprod1x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
+        vprod2x4567c1 = vmlal_s8(vprod2x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 1)));
+        vprod3x4567c1 = vmlal_s8(vprod3x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 1)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+        vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c1);
+        int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+        int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 2)));
+        int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 2)));
+        int16x8_t vprod3x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 2)));
+        const int8x8_t vb0123c2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x0123c2 = vmlal_s8(vprod0x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+        vprod1x0123c2 = vmlal_s8(vprod1x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
+        vprod2x0123c2 = vmlal_s8(vprod2x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 2)));
+        vprod3x0123c2 = vmlal_s8(vprod3x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 2)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+        vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c2);
+        int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+        int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 2)));
+        int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 2)));
+        int16x8_t vprod3x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 2)));
+        const int8x8_t vb4567c2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x4567c2 = vmlal_s8(vprod0x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+        vprod1x4567c2 = vmlal_s8(vprod1x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
+        vprod2x4567c2 = vmlal_s8(vprod2x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 2)));
+        vprod3x4567c2 = vmlal_s8(vprod3x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 2)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+        vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c2);
+        int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+        int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 3)));
+        int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 3)));
+        int16x8_t vprod3x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 3)));
+        const int8x8_t vb0123c3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x0123c3 = vmlal_s8(vprod0x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+        vprod1x0123c3 = vmlal_s8(vprod1x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
+        vprod2x0123c3 = vmlal_s8(vprod2x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 3)));
+        vprod3x0123c3 = vmlal_s8(vprod3x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 3)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
+        vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c3);
+        int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+        int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 3)));
+        int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 3)));
+        int16x8_t vprod3x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 3)));
+        const int8x8_t vb4567c3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x4567c3 = vmlal_s8(vprod0x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+        vprod1x4567c3 = vmlal_s8(vprod1x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
+        vprod2x4567c3 = vmlal_s8(vprod2x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 3)));
+        vprod3x4567c3 = vmlal_s8(vprod3x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 3)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
+        vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c3);
+
+        k -= 16 * sizeof(int8_t);
+      }
+
+      if (k >= 8 * sizeof(int8_t)) {
+        const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+        const int8x8_t va2 = vld1_s8(a2); a2 += 8;
+        const int8x8_t va3 = vld1_s8(a3); a3 += 8;
+
+        const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+        const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+        const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+        const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+        const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+        const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+        const int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+        const int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+        const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+        const int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
+        const int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+        const int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+        const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+        const int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
+        const int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+        const int16x8_t vprod3x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+        const int16x8_t vprod3x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+        const int16x8_t vprod3x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
+        vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c0);
+        vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c1);
+        vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c2);
+        vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c3);
+        const int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+        const int16x8_t vprod3x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+        const int16x8_t vprod3x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+        const int16x8_t vprod3x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
+        vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c0);
+        vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c1);
+        vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c2);
+        vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c3);
+
+        k -= 8 * sizeof(int8_t);
+      }
+
+      if XNN_UNLIKELY(k != 0) {
+        const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+        const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
+        const int8x8_t va2 = vld1_s8(a2); a2 = (const int8_t*) ((uintptr_t) a2 + k);
+        const int8x8_t va3 = vld1_s8(a3); a3 = (const int8_t*) ((uintptr_t) a3 + k);
+
+        const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+        const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+        const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+        const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+        const int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+        const int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+        const int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+        vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c0);
+        const int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+        vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c0);
+
+        if (k > 2 * sizeof(int8_t)) {
+          const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+          const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+          const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+          vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+          const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+          vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+          const int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+          vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+          const int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+          vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+          const int16x8_t vprod3x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+          vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c1);
+          const int16x8_t vprod3x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+          vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c1);
+
+          if (k > 4 * sizeof(int8_t)) {
+            const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+            const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+            const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+            const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+            vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+            const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+            vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+            const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+            vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+            const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+            vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+            const int16x8_t vprod3x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+            vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c2);
+            const int16x8_t vprod3x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+            vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c2);
+          }
+        }
+      }
+      p -= 4 * sizeof(void*);
+    } while (p != 0);
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+    vacc3x0123 = vqrdmulhq_s32(vacc3x0123, vmultiplier);
+    vacc3x4567 = vqrdmulhq_s32(vacc3x4567, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+    vacc3x0123 = vsraq_n_s32(vacc3x0123, vbicq_s32(vacc3x0123, vzero_shift_mask), 31);
+    vacc3x4567 = vsraq_n_s32(vacc3x4567, vbicq_s32(vacc3x4567, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+    vacc3x0123 = vrshlq_s32(vacc3x0123, vright_shift);
+    vacc3x4567 = vrshlq_s32(vacc3x4567, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+    const int16x8_t vacc3x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
+    int8x16_t vout2x01234567_3x01234567 = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc3x01234567);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+    const int16x8_t vacc3x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x0123), vqmovn_s32(vacc3x4567)), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
+    int8x16_t vout2x01234567_3x01234567 = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc3x01234567));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout2x01234567_3x01234567 = vmaxq_s8(vout2x01234567_3x01234567, voutput_min);
+    vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min);
+
+    vout2x01234567_3x01234567 = vminq_s8(vout2x01234567_3x01234567, voutput_max);
+    vout0x01234567_1x01234567 = vminq_s8(vout0x01234567_1x01234567, voutput_max);
+
+    if (nc >= 8) {
+      vst1_s8(c3 + 0, vget_high_s8(vout2x01234567_3x01234567));
+      vst1_s8(c2 + 0, vget_low_s8(vout2x01234567_3x01234567));
+      vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567));
+      vst1_s8(c0 + 0, vget_low_s8(vout0x01234567_1x01234567));
+
+      c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c3, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 2); c3 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 0); c2 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c3, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 4); c3 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 0); c2 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c3, vout2x01234567_3x01234567, 8);
+        vst1q_lane_s8(c2, vout2x01234567_3x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/4x8c2-minmax-neon-mull-padal-dup.c b/src/qs8-igemm/gen/4x8c2-minmax-neon-mull-padal-dup.c
new file mode 100644
index 0000000..5f635fa
--- /dev/null
+++ b/src/qs8-igemm/gen/4x8c2-minmax-neon-mull-padal-dup.c
@@ -0,0 +1,345 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/c2-neon-mull-padal-dup.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (4 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+  int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    c3 = c2;
+  }
+
+  do {
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc1x0123 = vacc0x0123;
+    int32x4_t vacc1x4567 = vacc0x4567;
+    int32x4_t vacc2x0123 = vacc0x0123;
+    int32x4_t vacc2x4567 = vacc0x4567;
+    int32x4_t vacc3x0123 = vacc0x0123;
+    int32x4_t vacc3x4567 = vacc0x4567;
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      const int8_t* restrict a2 = a[2];
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+      }
+      const int8_t* restrict a3 = a[3];
+      if XNN_UNPREDICTABLE(a3 != zero) {
+        a3 = (const int8_t*) ((uintptr_t) a3 + a_offset);
+      }
+      a += 4;
+
+      size_t k = kc;
+
+
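+      // Main K loop of the mull variant: 8 K bytes per row per iteration; each 2-byte K group
+      // (c0..c3) uses a single vmull_s8 against the duplicated activation pair (no vmlal_s8 second
+      // pass), accumulated via vpadalq_s16.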
+      while (k >= 8 * sizeof(int8_t)) {
+        const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+        const int8x8_t va2 = vld1_s8(a2); a2 += 8;
+        const int8x8_t va3 = vld1_s8(a3); a3 += 8;
+
+        const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+        const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+        const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+        const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+        const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+        const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+        const int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+        const int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+        const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+        const int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
+        const int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+        const int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+        const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+        const int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
+        const int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+        const int16x8_t vprod3x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+        const int16x8_t vprod3x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+        const int16x8_t vprod3x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
+        vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c0);
+        vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c1);
+        vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c2);
+        vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c3);
+        const int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+        const int16x8_t vprod3x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+        const int16x8_t vprod3x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+        const int16x8_t vprod3x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
+        vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c0);
+        vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c1);
+        vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c2);
+        vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c3);
+
+        k -= 8 * sizeof(int8_t);
+      }
+
+      if XNN_UNLIKELY(k != 0) {
+        const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+        const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
+        const int8x8_t va2 = vld1_s8(a2); a2 = (const int8_t*) ((uintptr_t) a2 + k);
+        const int8x8_t va3 = vld1_s8(a3); a3 = (const int8_t*) ((uintptr_t) a3 + k);
+
+        const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+        const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+        const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+        const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+        const int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+        const int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+        const int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+        vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c0);
+        const int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+        vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c0);
+
+        if (k > 2 * sizeof(int8_t)) {
+          const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+          const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+          const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+          vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+          const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+          vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+          const int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+          vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+          const int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+          vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+          const int16x8_t vprod3x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+          vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c1);
+          const int16x8_t vprod3x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+          vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c1);
+
+          if (k > 4 * sizeof(int8_t)) {
+            const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+            const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+            const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+            const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+            vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+            const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+            vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+            const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+            vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+            const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+            vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+            const int16x8_t vprod3x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+            vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c2);
+            const int16x8_t vprod3x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+            vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c2);
+          }
+        }
+      }
+      p -= 4 * sizeof(void*);
+    } while (p != 0);
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+    vacc3x0123 = vqrdmulhq_s32(vacc3x0123, vmultiplier);
+    vacc3x4567 = vqrdmulhq_s32(vacc3x4567, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+    vacc3x0123 = vsraq_n_s32(vacc3x0123, vbicq_s32(vacc3x0123, vzero_shift_mask), 31);
+    vacc3x4567 = vsraq_n_s32(vacc3x4567, vbicq_s32(vacc3x4567, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+    vacc3x0123 = vrshlq_s32(vacc3x0123, vright_shift);
+    vacc3x4567 = vrshlq_s32(vacc3x4567, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+    const int16x8_t vacc3x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
+    int8x16_t vout2x01234567_3x01234567 = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc3x01234567);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+    const int16x8_t vacc3x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x0123), vqmovn_s32(vacc3x4567)), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
+    int8x16_t vout2x01234567_3x01234567 = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc3x01234567));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout2x01234567_3x01234567 = vmaxq_s8(vout2x01234567_3x01234567, voutput_min);
+    vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min);
+
+    vout2x01234567_3x01234567 = vminq_s8(vout2x01234567_3x01234567, voutput_max);
+    vout0x01234567_1x01234567 = vminq_s8(vout0x01234567_1x01234567, voutput_max);
+
+    if (nc >= 8) {
+      vst1_s8(c3 + 0, vget_high_s8(vout2x01234567_3x01234567));
+      vst1_s8(c2 + 0, vget_low_s8(vout2x01234567_3x01234567));
+      vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567));
+      vst1_s8(c0 + 0, vget_low_s8(vout0x01234567_1x01234567));
+
+      c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c3, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 2); c3 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 0); c2 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c3, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 4); c3 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 0); c2 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c3, vout2x01234567_3x01234567, 8);
+        vst1q_lane_s8(c2, vout2x01234567_3x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
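
Note on the epilogue above: each int32 accumulator is requantized with a fixed-point multiply (vqrdmulhq_s32 against params->neon.multiplier), a rounding right shift (the vsraq_n_s32/vbicq_s32 fixup followed by vrshlq_s32 with params->neon.right_shift), a saturating add of the output zero point, and a final clamp to [output_min, output_max] before the int8 stores. A scalar sketch of that sequence follows; it is illustrative only — the helper name is made up, the shift is taken here as a non-negative amount (the kernel encodes it as a shift operand for vrshlq_s32), and the exact tie-breaking of the NEON rounding is not reproduced.

#include <stdint.h>

/* Hypothetical scalar model of the requantization epilogue (not part of XNNPACK). */
static inline int8_t requantize_sketch(
    int32_t acc, int32_t multiplier, uint32_t shift,
    int16_t output_zero_point, int8_t output_min, int8_t output_max)
{
  /* vqrdmulhq_s32: high 32 bits of the doubled product, with rounding. */
  const int64_t product = (int64_t) acc * (int64_t) multiplier;
  int32_t scaled = (int32_t) ((product + (INT64_C(1) << 30)) >> 31);
  /* vrshlq_s32 by a negative amount: rounding arithmetic shift right. */
  if (shift != 0) {
    scaled = (int32_t) (((int64_t) scaled + (INT64_C(1) << (shift - 1))) >> shift);
  }
  /* Add the output zero point, then clamp to the output range. */
  int32_t out = scaled + (int32_t) output_zero_point;
  if (out < (int32_t) output_min) out = (int32_t) output_min;
  if (out > (int32_t) output_max) out = (int32_t) output_max;
  return (int8_t) out;
}
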
diff --git a/src/qs8-igemm/gen/4x8c4-minmax-neondot.c b/src/qs8-igemm/gen/4x8c4-minmax-neondot.c
index 3f66a4b..989ba1b 100644
--- a/src/qs8-igemm/gen/4x8c4-minmax-neondot.c
+++ b/src/qs8-igemm/gen/4x8c4-minmax-neondot.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_4x8c4__neondot(
@@ -40,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 4);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
@@ -95,10 +96,10 @@
         const int8x8_t va3x01234567 = vld1_s8(a3); a3 += 8;
 
         // Load a 8x8 block of weights.
-        const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
+        const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+        const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+        const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+        const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
 
         // Multiply-accumulate: 4x8 * 8x8 --> 4x8.
         vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb0123x0123, va0x01234567, 0);
@@ -120,7 +121,7 @@
 
         k -= 8 * sizeof(int8_t);
       }
-      // Handle up to 7 final positions of `k`
+      // Handle up to 4 final positions of `k`
       if XNN_UNLIKELY(k != 0) {
         // Load a 4x4 block of activations.
         const int8x8_t va0x01234567 = vld1_s8(a0);
@@ -129,8 +130,8 @@
         const int8x8_t va3x01234567 = vld1_s8(a3);
 
         // Load a 4x8 block of weights.
-        const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
+        const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+        const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
 
         // Multiply-accumulate: 4x4 * 4x8 --> 4x8.
         vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb0123x0123, va0x01234567, 0);
@@ -141,22 +142,6 @@
         vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb0123x4567, va2x01234567, 0);
         vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb0123x0123, va3x01234567, 0);
         vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb0123x4567, va3x01234567, 0);
-
-        if (k > 4) {
-          // Load a 4x8 block of weights.
-          const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-          const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-
-          // Multiply-accumulate: 4x4 * 4x8 --> 4x8.
-          vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
-          vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
-          vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb4567x0123, va1x01234567, 1);
-          vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb4567x4567, va1x01234567, 1);
-          vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb4567x0123, va2x01234567, 1);
-          vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb4567x4567, va2x01234567, 1);
-          vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb4567x0123, va3x01234567, 1);
-          vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb4567x4567, va3x01234567, 1);
-        }
       }
       p -= 4 * sizeof(void*);
     } while (p != 0);
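
With kc rounded up to a multiple of 4 at the top of this kernel, the remainder left after the 8-byte main loop can only be 0 or 4 bytes, so the deleted `k > 4` block was unreachable and the comment above it now states the 4-byte bound. For reference, a scalar model of what each vdotq_lane_s32 step accumulates is sketched below (illustrative only; the helper name and argument layout are assumptions of this note, not XNNPACK code): the 16-byte operand supplies four int8 quadruplets, one per output lane, and the lane index selects which four activation bytes they are dotted with.

#include <stdint.h>

/* Hypothetical scalar model of vdotq_lane_s32(acc, b, a, lane). */
static inline void sdot_lane_sketch(int32_t acc[4], const int8_t b[16], const int8_t a[8], int lane)
{
  for (int n = 0; n < 4; n++) {      /* one int32 accumulator lane per group of 4 weights */
    for (int i = 0; i < 4; i++) {    /* four int8 products per lane */
      acc[n] += (int32_t) b[n * 4 + i] * (int32_t) a[lane * 4 + i];
    }
  }
}
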
diff --git a/src/qs8-igemm/gen/4x8c8-minmax-neon-mlal-padal.c b/src/qs8-igemm/gen/4x8c8-minmax-neon-mlal-padal.c
new file mode 100644
index 0000000..4fb03b0
--- /dev/null
+++ b/src/qs8-igemm/gen/4x8c8-minmax-neon-mlal-padal.c
@@ -0,0 +1,508 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/c8-neon-mull-padal.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (4 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+  int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    c3 = c2;
+  }
+
+  do {
+    int32x4_t vacc0x0 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x1 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x2 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x3 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x4 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x5 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x6 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x7 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc1x0 = vacc0x0;
+    int32x4_t vacc1x1 = vacc0x1;
+    int32x4_t vacc1x2 = vacc0x2;
+    int32x4_t vacc1x3 = vacc0x3;
+    int32x4_t vacc1x4 = vacc0x4;
+    int32x4_t vacc1x5 = vacc0x5;
+    int32x4_t vacc1x6 = vacc0x6;
+    int32x4_t vacc1x7 = vacc0x7;
+    int32x4_t vacc2x0 = vacc0x0;
+    int32x4_t vacc2x1 = vacc0x1;
+    int32x4_t vacc2x2 = vacc0x2;
+    int32x4_t vacc2x3 = vacc0x3;
+    int32x4_t vacc2x4 = vacc0x4;
+    int32x4_t vacc2x5 = vacc0x5;
+    int32x4_t vacc2x6 = vacc0x6;
+    int32x4_t vacc2x7 = vacc0x7;
+    int32x4_t vacc3x0 = vacc0x0;
+    int32x4_t vacc3x1 = vacc0x1;
+    int32x4_t vacc3x2 = vacc0x2;
+    int32x4_t vacc3x3 = vacc0x3;
+    int32x4_t vacc3x4 = vacc0x4;
+    int32x4_t vacc3x5 = vacc0x5;
+    int32x4_t vacc3x6 = vacc0x6;
+    int32x4_t vacc3x7 = vacc0x7;
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      const int8_t* restrict a2 = a[2];
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+      }
+      const int8_t* restrict a3 = a[3];
+      if XNN_UNPREDICTABLE(a3 != zero) {
+        a3 = (const int8_t*) ((uintptr_t) a3 + a_offset);
+      }
+      a += 4;
+
+      size_t k = kc;
+      // 2x partially-unrolled loop: load 16 bytes at a time and accumulate with MLAL.
+      while (k >= 16 * sizeof(int8_t)) {
+        const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va0x1 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va1x0 = vld1_s8(a1); a1 += 8;
+        const int8x8_t va1x1 = vld1_s8(a1); a1 += 8;
+        const int8x8_t va2x0 = vld1_s8(a2); a2 += 8;
+        const int8x8_t va2x1 = vld1_s8(a2); a2 += 8;
+        const int8x8_t va3x0 = vld1_s8(a3); a3 += 8;
+        const int8x8_t va3x1 = vld1_s8(a3); a3 += 8;
+
+        const int8x8_t vb0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb4x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb5x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb6x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        const int8x8_t vb7x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+
+        const int8x8_t vb0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x0 = vmull_s8(vb0x0, va0x0);
+        int16x8_t vprod1x0 = vmull_s8(vb0x0, va1x0);
+        int16x8_t vprod2x0 = vmull_s8(vb0x0, va2x0);
+        int16x8_t vprod3x0 = vmull_s8(vb0x0, va3x0);
+        vprod0x0 = vmlal_s8(vprod0x0, vb0x1, va0x1);
+        vprod1x0 = vmlal_s8(vprod1x0, vb0x1, va1x1);
+        vprod2x0 = vmlal_s8(vprod2x0, vb0x1, va2x1);
+        vprod3x0 = vmlal_s8(vprod3x0, vb0x1, va3x1);
+        vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+        vacc1x0 = vpadalq_s16(vacc1x0, vprod1x0);
+        vacc2x0 = vpadalq_s16(vacc2x0, vprod2x0);
+        vacc3x0 = vpadalq_s16(vacc3x0, vprod3x0);
+        const int8x8_t vb1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x1 = vmull_s8(vb1x0, va0x0);
+        int16x8_t vprod1x1 = vmull_s8(vb1x0, va1x0);
+        int16x8_t vprod2x1 = vmull_s8(vb1x0, va2x0);
+        int16x8_t vprod3x1 = vmull_s8(vb1x0, va3x0);
+        vprod0x1 = vmlal_s8(vprod0x1, vb1x1, va0x1);
+        vprod1x1 = vmlal_s8(vprod1x1, vb1x1, va1x1);
+        vprod2x1 = vmlal_s8(vprod2x1, vb1x1, va2x1);
+        vprod3x1 = vmlal_s8(vprod3x1, vb1x1, va3x1);
+        vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+        vacc1x1 = vpadalq_s16(vacc1x1, vprod1x1);
+        vacc2x1 = vpadalq_s16(vacc2x1, vprod2x1);
+        vacc3x1 = vpadalq_s16(vacc3x1, vprod3x1);
+        const int8x8_t vb2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x2 = vmull_s8(vb2x0, va0x0);
+        int16x8_t vprod1x2 = vmull_s8(vb2x0, va1x0);
+        int16x8_t vprod2x2 = vmull_s8(vb2x0, va2x0);
+        int16x8_t vprod3x2 = vmull_s8(vb2x0, va3x0);
+        vprod0x2 = vmlal_s8(vprod0x2, vb2x1, va0x1);
+        vprod1x2 = vmlal_s8(vprod1x2, vb2x1, va1x1);
+        vprod2x2 = vmlal_s8(vprod2x2, vb2x1, va2x1);
+        vprod3x2 = vmlal_s8(vprod3x2, vb2x1, va3x1);
+        vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+        vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);
+        vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
+        vacc3x2 = vpadalq_s16(vacc3x2, vprod3x2);
+        const int8x8_t vb3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x3 = vmull_s8(vb3x0, va0x0);
+        int16x8_t vprod1x3 = vmull_s8(vb3x0, va1x0);
+        int16x8_t vprod2x3 = vmull_s8(vb3x0, va2x0);
+        int16x8_t vprod3x3 = vmull_s8(vb3x0, va3x0);
+        vprod0x3 = vmlal_s8(vprod0x3, vb3x1, va0x1);
+        vprod1x3 = vmlal_s8(vprod1x3, vb3x1, va1x1);
+        vprod2x3 = vmlal_s8(vprod2x3, vb3x1, va2x1);
+        vprod3x3 = vmlal_s8(vprod3x3, vb3x1, va3x1);
+        vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+        vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);
+        vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
+        vacc3x3 = vpadalq_s16(vacc3x3, vprod3x3);
+        const int8x8_t vb4x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x4 = vmull_s8(vb4x0, va0x0);
+        int16x8_t vprod1x4 = vmull_s8(vb4x0, va1x0);
+        int16x8_t vprod2x4 = vmull_s8(vb4x0, va2x0);
+        int16x8_t vprod3x4 = vmull_s8(vb4x0, va3x0);
+        vprod0x4 = vmlal_s8(vprod0x4, vb4x1, va0x1);
+        vprod1x4 = vmlal_s8(vprod1x4, vb4x1, va1x1);
+        vprod2x4 = vmlal_s8(vprod2x4, vb4x1, va2x1);
+        vprod3x4 = vmlal_s8(vprod3x4, vb4x1, va3x1);
+        vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+        vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);
+        vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);
+        vacc3x4 = vpadalq_s16(vacc3x4, vprod3x4);
+        const int8x8_t vb5x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x5 = vmull_s8(vb5x0, va0x0);
+        int16x8_t vprod1x5 = vmull_s8(vb5x0, va1x0);
+        int16x8_t vprod2x5 = vmull_s8(vb5x0, va2x0);
+        int16x8_t vprod3x5 = vmull_s8(vb5x0, va3x0);
+        vprod0x5 = vmlal_s8(vprod0x5, vb5x1, va0x1);
+        vprod1x5 = vmlal_s8(vprod1x5, vb5x1, va1x1);
+        vprod2x5 = vmlal_s8(vprod2x5, vb5x1, va2x1);
+        vprod3x5 = vmlal_s8(vprod3x5, vb5x1, va3x1);
+        vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+        vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);
+        vacc2x5 = vpadalq_s16(vacc2x5, vprod2x5);
+        vacc3x5 = vpadalq_s16(vacc3x5, vprod3x5);
+        const int8x8_t vb6x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x6 = vmull_s8(vb6x0, va0x0);
+        int16x8_t vprod1x6 = vmull_s8(vb6x0, va1x0);
+        int16x8_t vprod2x6 = vmull_s8(vb6x0, va2x0);
+        int16x8_t vprod3x6 = vmull_s8(vb6x0, va3x0);
+        vprod0x6 = vmlal_s8(vprod0x6, vb6x1, va0x1);
+        vprod1x6 = vmlal_s8(vprod1x6, vb6x1, va1x1);
+        vprod2x6 = vmlal_s8(vprod2x6, vb6x1, va2x1);
+        vprod3x6 = vmlal_s8(vprod3x6, vb6x1, va3x1);
+        vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+        vacc1x6 = vpadalq_s16(vacc1x6, vprod1x6);
+        vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6);
+        vacc3x6 = vpadalq_s16(vacc3x6, vprod3x6);
+        const int8x8_t vb7x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(  int8_t));
+        int16x8_t vprod0x7 = vmull_s8(vb7x0, va0x0);
+        int16x8_t vprod1x7 = vmull_s8(vb7x0, va1x0);
+        int16x8_t vprod2x7 = vmull_s8(vb7x0, va2x0);
+        int16x8_t vprod3x7 = vmull_s8(vb7x0, va3x0);
+        vprod0x7 = vmlal_s8(vprod0x7, vb7x1, va0x1);
+        vprod1x7 = vmlal_s8(vprod1x7, vb7x1, va1x1);
+        vprod2x7 = vmlal_s8(vprod2x7, vb7x1, va2x1);
+        vprod3x7 = vmlal_s8(vprod3x7, vb7x1, va3x1);
+        vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+        vacc1x7 = vpadalq_s16(vacc1x7, vprod1x7);
+        vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7);
+        vacc3x7 = vpadalq_s16(vacc3x7, vprod3x7);
+
+        k -= 16 * sizeof(int8_t);
+      }
+
+      // Handle 8 bytes at a time using MUL.
+      if (k > 0) {
+        const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+        const int8x8_t va2 = vld1_s8(a2); a2 += 8;
+        const int8x8_t va3 = vld1_s8(a3); a3 += 8;
+
+        const int8x8_t vb0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x0 = vmull_s8(vb0, va0);
+        const int16x8_t vprod1x0 = vmull_s8(vb0, va1);
+        const int16x8_t vprod2x0 = vmull_s8(vb0, va2);
+        const int16x8_t vprod3x0 = vmull_s8(vb0, va3);
+        vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+        vacc1x0 = vpadalq_s16(vacc1x0, vprod1x0);
+        vacc2x0 = vpadalq_s16(vacc2x0, vprod2x0);
+        vacc3x0 = vpadalq_s16(vacc3x0, vprod3x0);
+        const int8x8_t vb1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x1 = vmull_s8(vb1, va0);
+        const int16x8_t vprod1x1 = vmull_s8(vb1, va1);
+        const int16x8_t vprod2x1 = vmull_s8(vb1, va2);
+        const int16x8_t vprod3x1 = vmull_s8(vb1, va3);
+        vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+        vacc1x1 = vpadalq_s16(vacc1x1, vprod1x1);
+        vacc2x1 = vpadalq_s16(vacc2x1, vprod2x1);
+        vacc3x1 = vpadalq_s16(vacc3x1, vprod3x1);
+        const int8x8_t vb2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x2 = vmull_s8(vb2, va0);
+        const int16x8_t vprod1x2 = vmull_s8(vb2, va1);
+        const int16x8_t vprod2x2 = vmull_s8(vb2, va2);
+        const int16x8_t vprod3x2 = vmull_s8(vb2, va3);
+        vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+        vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);
+        vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
+        vacc3x2 = vpadalq_s16(vacc3x2, vprod3x2);
+        const int8x8_t vb3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x3 = vmull_s8(vb3, va0);
+        const int16x8_t vprod1x3 = vmull_s8(vb3, va1);
+        const int16x8_t vprod2x3 = vmull_s8(vb3, va2);
+        const int16x8_t vprod3x3 = vmull_s8(vb3, va3);
+        vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+        vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);
+        vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
+        vacc3x3 = vpadalq_s16(vacc3x3, vprod3x3);
+        const int8x8_t vb4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x4 = vmull_s8(vb4, va0);
+        const int16x8_t vprod1x4 = vmull_s8(vb4, va1);
+        const int16x8_t vprod2x4 = vmull_s8(vb4, va2);
+        const int16x8_t vprod3x4 = vmull_s8(vb4, va3);
+        vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+        vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);
+        vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);
+        vacc3x4 = vpadalq_s16(vacc3x4, vprod3x4);
+        const int8x8_t vb5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x5 = vmull_s8(vb5, va0);
+        const int16x8_t vprod1x5 = vmull_s8(vb5, va1);
+        const int16x8_t vprod2x5 = vmull_s8(vb5, va2);
+        const int16x8_t vprod3x5 = vmull_s8(vb5, va3);
+        vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+        vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);
+        vacc2x5 = vpadalq_s16(vacc2x5, vprod2x5);
+        vacc3x5 = vpadalq_s16(vacc3x5, vprod3x5);
+        const int8x8_t vb6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x6 = vmull_s8(vb6, va0);
+        const int16x8_t vprod1x6 = vmull_s8(vb6, va1);
+        const int16x8_t vprod2x6 = vmull_s8(vb6, va2);
+        const int16x8_t vprod3x6 = vmull_s8(vb6, va3);
+        vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+        vacc1x6 = vpadalq_s16(vacc1x6, vprod1x6);
+        vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6);
+        vacc3x6 = vpadalq_s16(vacc3x6, vprod3x6);
+        const int8x8_t vb7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x7 = vmull_s8(vb7, va0);
+        const int16x8_t vprod1x7 = vmull_s8(vb7, va1);
+        const int16x8_t vprod2x7 = vmull_s8(vb7, va2);
+        const int16x8_t vprod3x7 = vmull_s8(vb7, va3);
+        vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+        vacc1x7 = vpadalq_s16(vacc1x7, vprod1x7);
+        vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7);
+        vacc3x7 = vpadalq_s16(vacc3x7, vprod3x7);
+
+        k -= 8 * sizeof(int8_t);
+      }
+
+      p -= 4 * sizeof(void*);
+    } while (p != 0);
+
+#if XNN_ARCH_ARM64
+    const int32x4_t vsum0x01 = vpaddq_s32(vacc0x0, vacc0x1);
+    const int32x4_t vsum0x23 = vpaddq_s32(vacc0x2, vacc0x3);
+    const int32x4_t vsum0x45 = vpaddq_s32(vacc0x4, vacc0x5);
+    const int32x4_t vsum0x67 = vpaddq_s32(vacc0x6, vacc0x7);
+    const int32x4_t vsum1x01 = vpaddq_s32(vacc1x0, vacc1x1);
+    const int32x4_t vsum1x23 = vpaddq_s32(vacc1x2, vacc1x3);
+    const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);
+    const int32x4_t vsum1x67 = vpaddq_s32(vacc1x6, vacc1x7);
+    const int32x4_t vsum2x01 = vpaddq_s32(vacc2x0, vacc2x1);
+    const int32x4_t vsum2x23 = vpaddq_s32(vacc2x2, vacc2x3);
+    const int32x4_t vsum2x45 = vpaddq_s32(vacc2x4, vacc2x5);
+    const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7);
+    const int32x4_t vsum3x01 = vpaddq_s32(vacc3x0, vacc3x1);
+    const int32x4_t vsum3x23 = vpaddq_s32(vacc3x2, vacc3x3);
+    const int32x4_t vsum3x45 = vpaddq_s32(vacc3x4, vacc3x5);
+    const int32x4_t vsum3x67 = vpaddq_s32(vacc3x6, vacc3x7);
+    int32x4_t vacc0x0123 = vpaddq_s32(vsum0x01, vsum0x23);
+    int32x4_t vacc0x4567 = vpaddq_s32(vsum0x45, vsum0x67);
+    int32x4_t vacc1x0123 = vpaddq_s32(vsum1x01, vsum1x23);
+    int32x4_t vacc1x4567 = vpaddq_s32(vsum1x45, vsum1x67);
+    int32x4_t vacc2x0123 = vpaddq_s32(vsum2x01, vsum2x23);
+    int32x4_t vacc2x4567 = vpaddq_s32(vsum2x45, vsum2x67);
+    int32x4_t vacc3x0123 = vpaddq_s32(vsum3x01, vsum3x23);
+    int32x4_t vacc3x4567 = vpaddq_s32(vsum3x45, vsum3x67);
+#else
+    const int32x2_t vpsum0x0 = vadd_s32(vget_low_s32(vacc0x0), vget_high_s32(vacc0x0));
+    const int32x2_t vpsum0x1 = vadd_s32(vget_low_s32(vacc0x1), vget_high_s32(vacc0x1));
+    const int32x2_t vpsum0x2 = vadd_s32(vget_low_s32(vacc0x2), vget_high_s32(vacc0x2));
+    const int32x2_t vpsum0x3 = vadd_s32(vget_low_s32(vacc0x3), vget_high_s32(vacc0x3));
+    const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1);
+    const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3);
+    int32x4_t vacc0x0123 = vcombine_s32(vsum0x01, vsum0x23 );
+    const int32x2_t vpsum0x4 = vadd_s32(vget_low_s32(vacc0x4), vget_high_s32(vacc0x4));
+    const int32x2_t vpsum0x5 = vadd_s32(vget_low_s32(vacc0x5), vget_high_s32(vacc0x5));
+    const int32x2_t vpsum0x6 = vadd_s32(vget_low_s32(vacc0x6), vget_high_s32(vacc0x6));
+    const int32x2_t vpsum0x7 = vadd_s32(vget_low_s32(vacc0x7), vget_high_s32(vacc0x7));
+    const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5);
+    const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7);
+    int32x4_t vacc0x4567 = vcombine_s32(vsum0x45, vsum0x67 );
+    const int32x2_t vpsum1x0 = vadd_s32(vget_low_s32(vacc1x0), vget_high_s32(vacc1x0));
+    const int32x2_t vpsum1x1 = vadd_s32(vget_low_s32(vacc1x1), vget_high_s32(vacc1x1));
+    const int32x2_t vpsum1x2 = vadd_s32(vget_low_s32(vacc1x2), vget_high_s32(vacc1x2));
+    const int32x2_t vpsum1x3 = vadd_s32(vget_low_s32(vacc1x3), vget_high_s32(vacc1x3));
+    const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1);
+    const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3);
+    int32x4_t vacc1x0123 = vcombine_s32(vsum1x01, vsum1x23 );
+    const int32x2_t vpsum1x4 = vadd_s32(vget_low_s32(vacc1x4), vget_high_s32(vacc1x4));
+    const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));
+    const int32x2_t vpsum1x6 = vadd_s32(vget_low_s32(vacc1x6), vget_high_s32(vacc1x6));
+    const int32x2_t vpsum1x7 = vadd_s32(vget_low_s32(vacc1x7), vget_high_s32(vacc1x7));
+    const int32x2_t vsum1x45 = vpadd_s32(vpsum1x4, vpsum1x5);
+    const int32x2_t vsum1x67 = vpadd_s32(vpsum1x6, vpsum1x7);
+    int32x4_t vacc1x4567 = vcombine_s32(vsum1x45, vsum1x67 );
+    const int32x2_t vpsum2x0 = vadd_s32(vget_low_s32(vacc2x0), vget_high_s32(vacc2x0));
+    const int32x2_t vpsum2x1 = vadd_s32(vget_low_s32(vacc2x1), vget_high_s32(vacc2x1));
+    const int32x2_t vpsum2x2 = vadd_s32(vget_low_s32(vacc2x2), vget_high_s32(vacc2x2));
+    const int32x2_t vpsum2x3 = vadd_s32(vget_low_s32(vacc2x3), vget_high_s32(vacc2x3));
+    const int32x2_t vsum2x01 = vpadd_s32(vpsum2x0, vpsum2x1);
+    const int32x2_t vsum2x23 = vpadd_s32(vpsum2x2, vpsum2x3);
+    int32x4_t vacc2x0123 = vcombine_s32(vsum2x01, vsum2x23 );
+    const int32x2_t vpsum2x4 = vadd_s32(vget_low_s32(vacc2x4), vget_high_s32(vacc2x4));
+    const int32x2_t vpsum2x5 = vadd_s32(vget_low_s32(vacc2x5), vget_high_s32(vacc2x5));
+    const int32x2_t vpsum2x6 = vadd_s32(vget_low_s32(vacc2x6), vget_high_s32(vacc2x6));
+    const int32x2_t vpsum2x7 = vadd_s32(vget_low_s32(vacc2x7), vget_high_s32(vacc2x7));
+    const int32x2_t vsum2x45 = vpadd_s32(vpsum2x4, vpsum2x5);
+    const int32x2_t vsum2x67 = vpadd_s32(vpsum2x6, vpsum2x7);
+    int32x4_t vacc2x4567 = vcombine_s32(vsum2x45, vsum2x67 );
+    const int32x2_t vpsum3x0 = vadd_s32(vget_low_s32(vacc3x0), vget_high_s32(vacc3x0));
+    const int32x2_t vpsum3x1 = vadd_s32(vget_low_s32(vacc3x1), vget_high_s32(vacc3x1));
+    const int32x2_t vpsum3x2 = vadd_s32(vget_low_s32(vacc3x2), vget_high_s32(vacc3x2));
+    const int32x2_t vpsum3x3 = vadd_s32(vget_low_s32(vacc3x3), vget_high_s32(vacc3x3));
+    const int32x2_t vsum3x01 = vpadd_s32(vpsum3x0, vpsum3x1);
+    const int32x2_t vsum3x23 = vpadd_s32(vpsum3x2, vpsum3x3);
+    int32x4_t vacc3x0123 = vcombine_s32(vsum3x01, vsum3x23 );
+    const int32x2_t vpsum3x4 = vadd_s32(vget_low_s32(vacc3x4), vget_high_s32(vacc3x4));
+    const int32x2_t vpsum3x5 = vadd_s32(vget_low_s32(vacc3x5), vget_high_s32(vacc3x5));
+    const int32x2_t vpsum3x6 = vadd_s32(vget_low_s32(vacc3x6), vget_high_s32(vacc3x6));
+    const int32x2_t vpsum3x7 = vadd_s32(vget_low_s32(vacc3x7), vget_high_s32(vacc3x7));
+    const int32x2_t vsum3x45 = vpadd_s32(vpsum3x4, vpsum3x5);
+    const int32x2_t vsum3x67 = vpadd_s32(vpsum3x6, vpsum3x7);
+    int32x4_t vacc3x4567 = vcombine_s32(vsum3x45, vsum3x67 );
+#endif
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+    vacc3x0123 = vqrdmulhq_s32(vacc3x0123, vmultiplier);
+    vacc3x4567 = vqrdmulhq_s32(vacc3x4567, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+    vacc3x0123 = vsraq_n_s32(vacc3x0123, vbicq_s32(vacc3x0123, vzero_shift_mask), 31);
+    vacc3x4567 = vsraq_n_s32(vacc3x4567, vbicq_s32(vacc3x4567, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+    vacc3x0123 = vrshlq_s32(vacc3x0123, vright_shift);
+    vacc3x4567 = vrshlq_s32(vacc3x4567, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+    const int16x8_t vacc3x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567), voutput_zero_point);
+    int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
+    int8x16_t vout2x01234567_3x01234567 = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc3x01234567);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+    const int16x8_t vacc3x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x0123), vqmovn_s32(vacc3x4567)), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
+    int8x16_t vout2x01234567_3x01234567 = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc3x01234567));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout2x01234567_3x01234567 = vmaxq_s8(vout2x01234567_3x01234567, voutput_min);
+    vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min);
+
+    vout2x01234567_3x01234567 = vminq_s8(vout2x01234567_3x01234567, voutput_max);
+    vout0x01234567_1x01234567 = vminq_s8(vout0x01234567_1x01234567, voutput_max);
+
+    if (nc >= 8) {
+      vst1_s8(c3 + 0, vget_high_s8(vout2x01234567_3x01234567));
+      vst1_s8(c2 + 0, vget_low_s8(vout2x01234567_3x01234567));
+      vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567));
+      vst1_s8(c0 + 0, vget_low_s8(vout0x01234567_1x01234567));
+
+      c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c3, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 2); c3 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 0); c2 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c3, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 4); c3 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 0); c2 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c3, vout2x01234567_3x01234567, 8);
+        vst1q_lane_s8(c2, vout2x01234567_3x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
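
In the c8 kernels above, each of the eight output channels keeps its own int32x4 accumulator: vmull_s8 widens eight int8 products to int16, the mlal variant folds a second group of eight bytes with vmlal_s8, and vpadalq_s16 pairwise-adds the int16 products into the int32 lanes. After the k loop, the eight per-channel vectors are reduced to the usual vacc?x0123/vacc?x4567 pairs with vpaddq_s32 on AArch64 (or vpadd_s32/vcombine_s32 otherwise) before the shared requantization epilogue. Mathematically, each lane of that reduction is the plain dot product sketched below; the sketch is illustrative only, with a made-up helper name and the assumption that kc has already been rounded up to a multiple of 8.

#include <stddef.h>
#include <stdint.h>

/* Hypothetical scalar model of one output channel's accumulation in the c8 path. */
static inline int32_t c8_channel_acc_sketch(int32_t bias, const int8_t* a, const int8_t* b, size_t kc)
{
  int32_t acc = bias;  /* the bias is what vld1q_lane_s32 loads from w */
  for (size_t k = 0; k < kc; k++) {
    /* vmull_s8/vmlal_s8 widen these products to int16; vpadalq_s16 folds them into int32. */
    acc += (int32_t) a[k] * (int32_t) b[k];
  }
  return acc;
}
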
diff --git a/src/qs8-igemm/gen/4x8c8-minmax-neon-mull-padal.c b/src/qs8-igemm/gen/4x8c8-minmax-neon-mull-padal.c
new file mode 100644
index 0000000..87ce9b5
--- /dev/null
+++ b/src/qs8-igemm/gen/4x8c8-minmax-neon-mull-padal.c
@@ -0,0 +1,381 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/c8-neon-mull-padal.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (4 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+  int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    c3 = c2;
+  }
+
+  do {
+    int32x4_t vacc0x0 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x1 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x2 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x3 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x4 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x5 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x6 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc0x7 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+    int32x4_t vacc1x0 = vacc0x0;
+    int32x4_t vacc1x1 = vacc0x1;
+    int32x4_t vacc1x2 = vacc0x2;
+    int32x4_t vacc1x3 = vacc0x3;
+    int32x4_t vacc1x4 = vacc0x4;
+    int32x4_t vacc1x5 = vacc0x5;
+    int32x4_t vacc1x6 = vacc0x6;
+    int32x4_t vacc1x7 = vacc0x7;
+    int32x4_t vacc2x0 = vacc0x0;
+    int32x4_t vacc2x1 = vacc0x1;
+    int32x4_t vacc2x2 = vacc0x2;
+    int32x4_t vacc2x3 = vacc0x3;
+    int32x4_t vacc2x4 = vacc0x4;
+    int32x4_t vacc2x5 = vacc0x5;
+    int32x4_t vacc2x6 = vacc0x6;
+    int32x4_t vacc2x7 = vacc0x7;
+    int32x4_t vacc3x0 = vacc0x0;
+    int32x4_t vacc3x1 = vacc0x1;
+    int32x4_t vacc3x2 = vacc0x2;
+    int32x4_t vacc3x3 = vacc0x3;
+    int32x4_t vacc3x4 = vacc0x4;
+    int32x4_t vacc3x5 = vacc0x5;
+    int32x4_t vacc3x6 = vacc0x6;
+    int32x4_t vacc3x7 = vacc0x7;
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      const int8_t* restrict a2 = a[2];
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+      }
+      const int8_t* restrict a3 = a[3];
+      if XNN_UNPREDICTABLE(a3 != zero) {
+        a3 = (const int8_t*) ((uintptr_t) a3 + a_offset);
+      }
+      a += 4;
+
+      size_t k = kc;
+
+      // Handle 8 bytes at a time using MUL.
+      while (k > 0) {
+        const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+        const int8x8_t va2 = vld1_s8(a2); a2 += 8;
+        const int8x8_t va3 = vld1_s8(a3); a3 += 8;
+
+        const int8x8_t vb0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x0 = vmull_s8(vb0, va0);
+        const int16x8_t vprod1x0 = vmull_s8(vb0, va1);
+        const int16x8_t vprod2x0 = vmull_s8(vb0, va2);
+        const int16x8_t vprod3x0 = vmull_s8(vb0, va3);
+        vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+        vacc1x0 = vpadalq_s16(vacc1x0, vprod1x0);
+        vacc2x0 = vpadalq_s16(vacc2x0, vprod2x0);
+        vacc3x0 = vpadalq_s16(vacc3x0, vprod3x0);
+        const int8x8_t vb1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x1 = vmull_s8(vb1, va0);
+        const int16x8_t vprod1x1 = vmull_s8(vb1, va1);
+        const int16x8_t vprod2x1 = vmull_s8(vb1, va2);
+        const int16x8_t vprod3x1 = vmull_s8(vb1, va3);
+        vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+        vacc1x1 = vpadalq_s16(vacc1x1, vprod1x1);
+        vacc2x1 = vpadalq_s16(vacc2x1, vprod2x1);
+        vacc3x1 = vpadalq_s16(vacc3x1, vprod3x1);
+        const int8x8_t vb2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x2 = vmull_s8(vb2, va0);
+        const int16x8_t vprod1x2 = vmull_s8(vb2, va1);
+        const int16x8_t vprod2x2 = vmull_s8(vb2, va2);
+        const int16x8_t vprod3x2 = vmull_s8(vb2, va3);
+        vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+        vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);
+        vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
+        vacc3x2 = vpadalq_s16(vacc3x2, vprod3x2);
+        const int8x8_t vb3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x3 = vmull_s8(vb3, va0);
+        const int16x8_t vprod1x3 = vmull_s8(vb3, va1);
+        const int16x8_t vprod2x3 = vmull_s8(vb3, va2);
+        const int16x8_t vprod3x3 = vmull_s8(vb3, va3);
+        vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+        vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);
+        vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
+        vacc3x3 = vpadalq_s16(vacc3x3, vprod3x3);
+        const int8x8_t vb4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x4 = vmull_s8(vb4, va0);
+        const int16x8_t vprod1x4 = vmull_s8(vb4, va1);
+        const int16x8_t vprod2x4 = vmull_s8(vb4, va2);
+        const int16x8_t vprod3x4 = vmull_s8(vb4, va3);
+        vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+        vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);
+        vacc2x4 = vpadalq_s16(vacc2x4, vprod2x4);
+        vacc3x4 = vpadalq_s16(vacc3x4, vprod3x4);
+        const int8x8_t vb5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x5 = vmull_s8(vb5, va0);
+        const int16x8_t vprod1x5 = vmull_s8(vb5, va1);
+        const int16x8_t vprod2x5 = vmull_s8(vb5, va2);
+        const int16x8_t vprod3x5 = vmull_s8(vb5, va3);
+        vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+        vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);
+        vacc2x5 = vpadalq_s16(vacc2x5, vprod2x5);
+        vacc3x5 = vpadalq_s16(vacc3x5, vprod3x5);
+        const int8x8_t vb6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x6 = vmull_s8(vb6, va0);
+        const int16x8_t vprod1x6 = vmull_s8(vb6, va1);
+        const int16x8_t vprod2x6 = vmull_s8(vb6, va2);
+        const int16x8_t vprod3x6 = vmull_s8(vb6, va3);
+        vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+        vacc1x6 = vpadalq_s16(vacc1x6, vprod1x6);
+        vacc2x6 = vpadalq_s16(vacc2x6, vprod2x6);
+        vacc3x6 = vpadalq_s16(vacc3x6, vprod3x6);
+        const int8x8_t vb7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int16x8_t vprod0x7 = vmull_s8(vb7, va0);
+        const int16x8_t vprod1x7 = vmull_s8(vb7, va1);
+        const int16x8_t vprod2x7 = vmull_s8(vb7, va2);
+        const int16x8_t vprod3x7 = vmull_s8(vb7, va3);
+        vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+        vacc1x7 = vpadalq_s16(vacc1x7, vprod1x7);
+        vacc2x7 = vpadalq_s16(vacc2x7, vprod2x7);
+        vacc3x7 = vpadalq_s16(vacc3x7, vprod3x7);
+
+        k -= 8 * sizeof(int8_t);
+      }
+
+      p -= 4 * sizeof(void*);
+    } while (p != 0);
+
+#if XNN_ARCH_ARM64
+    const int32x4_t vsum0x01 = vpaddq_s32(vacc0x0, vacc0x1);
+    const int32x4_t vsum0x23 = vpaddq_s32(vacc0x2, vacc0x3);
+    const int32x4_t vsum0x45 = vpaddq_s32(vacc0x4, vacc0x5);
+    const int32x4_t vsum0x67 = vpaddq_s32(vacc0x6, vacc0x7);
+    const int32x4_t vsum1x01 = vpaddq_s32(vacc1x0, vacc1x1);
+    const int32x4_t vsum1x23 = vpaddq_s32(vacc1x2, vacc1x3);
+    const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);
+    const int32x4_t vsum1x67 = vpaddq_s32(vacc1x6, vacc1x7);
+    const int32x4_t vsum2x01 = vpaddq_s32(vacc2x0, vacc2x1);
+    const int32x4_t vsum2x23 = vpaddq_s32(vacc2x2, vacc2x3);
+    const int32x4_t vsum2x45 = vpaddq_s32(vacc2x4, vacc2x5);
+    const int32x4_t vsum2x67 = vpaddq_s32(vacc2x6, vacc2x7);
+    const int32x4_t vsum3x01 = vpaddq_s32(vacc3x0, vacc3x1);
+    const int32x4_t vsum3x23 = vpaddq_s32(vacc3x2, vacc3x3);
+    const int32x4_t vsum3x45 = vpaddq_s32(vacc3x4, vacc3x5);
+    const int32x4_t vsum3x67 = vpaddq_s32(vacc3x6, vacc3x7);
+    int32x4_t vacc0x0123 = vpaddq_s32(vsum0x01, vsum0x23);
+    int32x4_t vacc0x4567 = vpaddq_s32(vsum0x45, vsum0x67);
+    int32x4_t vacc1x0123 = vpaddq_s32(vsum1x01, vsum1x23);
+    int32x4_t vacc1x4567 = vpaddq_s32(vsum1x45, vsum1x67);
+    int32x4_t vacc2x0123 = vpaddq_s32(vsum2x01, vsum2x23);
+    int32x4_t vacc2x4567 = vpaddq_s32(vsum2x45, vsum2x67);
+    int32x4_t vacc3x0123 = vpaddq_s32(vsum3x01, vsum3x23);
+    int32x4_t vacc3x4567 = vpaddq_s32(vsum3x45, vsum3x67);
+#else
+    const int32x2_t vpsum0x0 = vadd_s32(vget_low_s32(vacc0x0), vget_high_s32(vacc0x0));
+    const int32x2_t vpsum0x1 = vadd_s32(vget_low_s32(vacc0x1), vget_high_s32(vacc0x1));
+    const int32x2_t vpsum0x2 = vadd_s32(vget_low_s32(vacc0x2), vget_high_s32(vacc0x2));
+    const int32x2_t vpsum0x3 = vadd_s32(vget_low_s32(vacc0x3), vget_high_s32(vacc0x3));
+    const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1);
+    const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3);
+    int32x4_t vacc0x0123 = vcombine_s32(vsum0x01, vsum0x23);
+    const int32x2_t vpsum0x4 = vadd_s32(vget_low_s32(vacc0x4), vget_high_s32(vacc0x4));
+    const int32x2_t vpsum0x5 = vadd_s32(vget_low_s32(vacc0x5), vget_high_s32(vacc0x5));
+    const int32x2_t vpsum0x6 = vadd_s32(vget_low_s32(vacc0x6), vget_high_s32(vacc0x6));
+    const int32x2_t vpsum0x7 = vadd_s32(vget_low_s32(vacc0x7), vget_high_s32(vacc0x7));
+    const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5);
+    const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7);
+    int32x4_t vacc0x4567 = vcombine_s32(vsum0x45, vsum0x67);
+    const int32x2_t vpsum1x0 = vadd_s32(vget_low_s32(vacc1x0), vget_high_s32(vacc1x0));
+    const int32x2_t vpsum1x1 = vadd_s32(vget_low_s32(vacc1x1), vget_high_s32(vacc1x1));
+    const int32x2_t vpsum1x2 = vadd_s32(vget_low_s32(vacc1x2), vget_high_s32(vacc1x2));
+    const int32x2_t vpsum1x3 = vadd_s32(vget_low_s32(vacc1x3), vget_high_s32(vacc1x3));
+    const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1);
+    const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3);
+    int32x4_t vacc1x0123 = vcombine_s32(vsum1x01, vsum1x23);
+    const int32x2_t vpsum1x4 = vadd_s32(vget_low_s32(vacc1x4), vget_high_s32(vacc1x4));
+    const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));
+    const int32x2_t vpsum1x6 = vadd_s32(vget_low_s32(vacc1x6), vget_high_s32(vacc1x6));
+    const int32x2_t vpsum1x7 = vadd_s32(vget_low_s32(vacc1x7), vget_high_s32(vacc1x7));
+    const int32x2_t vsum1x45 = vpadd_s32(vpsum1x4, vpsum1x5);
+    const int32x2_t vsum1x67 = vpadd_s32(vpsum1x6, vpsum1x7);
+    int32x4_t vacc1x4567 = vcombine_s32(vsum1x45, vsum1x67);
+    const int32x2_t vpsum2x0 = vadd_s32(vget_low_s32(vacc2x0), vget_high_s32(vacc2x0));
+    const int32x2_t vpsum2x1 = vadd_s32(vget_low_s32(vacc2x1), vget_high_s32(vacc2x1));
+    const int32x2_t vpsum2x2 = vadd_s32(vget_low_s32(vacc2x2), vget_high_s32(vacc2x2));
+    const int32x2_t vpsum2x3 = vadd_s32(vget_low_s32(vacc2x3), vget_high_s32(vacc2x3));
+    const int32x2_t vsum2x01 = vpadd_s32(vpsum2x0, vpsum2x1);
+    const int32x2_t vsum2x23 = vpadd_s32(vpsum2x2, vpsum2x3);
+    int32x4_t vacc2x0123 = vcombine_s32(vsum2x01, vsum2x23);
+    const int32x2_t vpsum2x4 = vadd_s32(vget_low_s32(vacc2x4), vget_high_s32(vacc2x4));
+    const int32x2_t vpsum2x5 = vadd_s32(vget_low_s32(vacc2x5), vget_high_s32(vacc2x5));
+    const int32x2_t vpsum2x6 = vadd_s32(vget_low_s32(vacc2x6), vget_high_s32(vacc2x6));
+    const int32x2_t vpsum2x7 = vadd_s32(vget_low_s32(vacc2x7), vget_high_s32(vacc2x7));
+    const int32x2_t vsum2x45 = vpadd_s32(vpsum2x4, vpsum2x5);
+    const int32x2_t vsum2x67 = vpadd_s32(vpsum2x6, vpsum2x7);
+    int32x4_t vacc2x4567 = vcombine_s32(vsum2x45, vsum2x67);
+    const int32x2_t vpsum3x0 = vadd_s32(vget_low_s32(vacc3x0), vget_high_s32(vacc3x0));
+    const int32x2_t vpsum3x1 = vadd_s32(vget_low_s32(vacc3x1), vget_high_s32(vacc3x1));
+    const int32x2_t vpsum3x2 = vadd_s32(vget_low_s32(vacc3x2), vget_high_s32(vacc3x2));
+    const int32x2_t vpsum3x3 = vadd_s32(vget_low_s32(vacc3x3), vget_high_s32(vacc3x3));
+    const int32x2_t vsum3x01 = vpadd_s32(vpsum3x0, vpsum3x1);
+    const int32x2_t vsum3x23 = vpadd_s32(vpsum3x2, vpsum3x3);
+    int32x4_t vacc3x0123 = vcombine_s32(vsum3x01, vsum3x23);
+    const int32x2_t vpsum3x4 = vadd_s32(vget_low_s32(vacc3x4), vget_high_s32(vacc3x4));
+    const int32x2_t vpsum3x5 = vadd_s32(vget_low_s32(vacc3x5), vget_high_s32(vacc3x5));
+    const int32x2_t vpsum3x6 = vadd_s32(vget_low_s32(vacc3x6), vget_high_s32(vacc3x6));
+    const int32x2_t vpsum3x7 = vadd_s32(vget_low_s32(vacc3x7), vget_high_s32(vacc3x7));
+    const int32x2_t vsum3x45 = vpadd_s32(vpsum3x4, vpsum3x5);
+    const int32x2_t vsum3x67 = vpadd_s32(vpsum3x6, vpsum3x7);
+    int32x4_t vacc3x4567 = vcombine_s32(vsum3x45, vsum3x67);
+#endif
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+    vacc3x0123 = vqrdmulhq_s32(vacc3x0123, vmultiplier);
+    vacc3x4567 = vqrdmulhq_s32(vacc3x4567, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+    vacc3x0123 = vsraq_n_s32(vacc3x0123, vbicq_s32(vacc3x0123, vzero_shift_mask), 31);
+    vacc3x4567 = vsraq_n_s32(vacc3x4567, vbicq_s32(vacc3x4567, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+    vacc3x0123 = vrshlq_s32(vacc3x0123, vright_shift);
+    vacc3x4567 = vrshlq_s32(vacc3x4567, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+    const int16x8_t vacc3x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567), voutput_zero_point);
+    int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
+    int8x16_t vout2x01234567_3x01234567 = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc3x01234567);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+    const int16x8_t vacc3x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x0123), vqmovn_s32(vacc3x4567)), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
+    int8x16_t vout2x01234567_3x01234567 = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc3x01234567));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout2x01234567_3x01234567 = vmaxq_s8(vout2x01234567_3x01234567, voutput_min);
+    vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min);
+
+    vout2x01234567_3x01234567 = vminq_s8(vout2x01234567_3x01234567, voutput_max);
+    vout0x01234567_1x01234567 = vminq_s8(vout0x01234567_1x01234567, voutput_max);
+
+    if (nc >= 8) {
+      vst1_s8(c3 + 0, vget_high_s8(vout2x01234567_3x01234567));
+      vst1_s8(c2 + 0, vget_low_s8(vout2x01234567_3x01234567));
+      vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567));
+      vst1_s8(c0 + 0, vget_low_s8(vout0x01234567_1x01234567));
+
+      c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c3, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 2); c3 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 0); c2 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c3, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 4); c3 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 0); c2 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c3, vout2x01234567_3x01234567, 8);
+        vst1q_lane_s8(c2, vout2x01234567_3x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
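
For reference, the VQRDMULH / masked VSRA / VRSHL requantization sequence in the kernel above behaves like the scalar sketch below. This is illustrative only: the helper name is made up, and the INT32_MIN * INT32_MIN saturation corner case of VQRDMULH is ignored.

#include <stdint.h>

static inline int32_t requantize_q31(int32_t acc, int32_t multiplier, uint32_t shift) {
  // VQRDMULH: rounding doubling multiply-high, i.e. (acc * multiplier + 2^30) >> 31.
  const int64_t product = (int64_t) acc * (int64_t) multiplier;
  int32_t q31 = (int32_t) ((product + (INT64_C(1) << 30)) >> 31);
  // Masked VSRA #31: subtract 1 from negative values, but only when a non-zero
  // shift follows (vzero_shift_mask disables the correction for shift == 0).
  if (shift != 0 && q31 < 0) {
    q31 -= 1;
  }
  // VRSHL by -shift: rounding (round-half-up) arithmetic right shift.
  if (shift != 0) {
    q31 = (int32_t) (((int64_t) q31 + (INT64_C(1) << (shift - 1))) >> shift);
  }
  return q31;
}

Combined with the -1 correction for negative values, the round-half-up shift rounds ties away from zero, matching the gemmlowp-style requantization.
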
diff --git a/src/qs8-igemm/gen/6x16c4-minmax-neondot.c b/src/qs8-igemm/gen/6x16c4-minmax-neondot.c
index 58a80c8..9b524a1 100644
--- a/src/qs8-igemm/gen/6x16c4-minmax-neondot.c
+++ b/src/qs8-igemm/gen/6x16c4-minmax-neondot.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot(
@@ -40,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 4);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
@@ -129,14 +130,14 @@
         const int8x8_t va5x01234567 = vld1_s8(a5); a5 += 8;
 
         // Load a 8x16 block of weights.
-        const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb0123x89AB = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb0123xCDEF = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb4567x89AB = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb4567xCDEF = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
+        const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+        const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+        const int8x16_t vb0123x89AB = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+        const int8x16_t vb0123xCDEF = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+        const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+        const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+        const int8x16_t vb4567x89AB = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+        const int8x16_t vb4567xCDEF = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
 
         // Multiply-accumulate: 6x8 * 8x16 --> 6x16.
         vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb0123x0123, va0x01234567, 0);
@@ -190,7 +191,7 @@
 
         k -= 8 * sizeof(int8_t);
       }
-      // Handle up to 7 final positions of `k`
+      // Handle up to 4 final positions of `k`
       if XNN_UNLIKELY(k != 0) {
         // Load a 6x4 block of activations.
         const int8x8_t va0x01234567 = vld1_s8(a0);
@@ -201,10 +202,10 @@
         const int8x8_t va5x01234567 = vld1_s8(a5);
 
         // Load a 4x16 block of weights.
-        const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb0123x89AB = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb0123xCDEF = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
+        const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+        const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+        const int8x16_t vb0123x89AB = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+        const int8x16_t vb0123xCDEF = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
 
         // Multiply-accumulate: 6x4 * 4x16 --> 6x16.
         vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb0123x0123, va0x01234567, 0);
@@ -231,40 +232,6 @@
         vacc5x4567 = vdotq_lane_s32(vacc5x4567, vb0123x4567, va5x01234567, 0);
         vacc5x89AB = vdotq_lane_s32(vacc5x89AB, vb0123x89AB, va5x01234567, 0);
         vacc5xCDEF = vdotq_lane_s32(vacc5xCDEF, vb0123xCDEF, va5x01234567, 0);
-
-        if (k > 4) {
-          // Load a 4x16 block of weights.
-          const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-          const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-          const int8x16_t vb4567x89AB = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-          const int8x16_t vb4567xCDEF = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-
-          // Multiply-accumulate: 6x4 * 4x16 --> 6x16.
-          vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
-          vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
-          vacc0x89AB = vdotq_lane_s32(vacc0x89AB, vb4567x89AB, va0x01234567, 1);
-          vacc0xCDEF = vdotq_lane_s32(vacc0xCDEF, vb4567xCDEF, va0x01234567, 1);
-          vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb4567x0123, va1x01234567, 1);
-          vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb4567x4567, va1x01234567, 1);
-          vacc1x89AB = vdotq_lane_s32(vacc1x89AB, vb4567x89AB, va1x01234567, 1);
-          vacc1xCDEF = vdotq_lane_s32(vacc1xCDEF, vb4567xCDEF, va1x01234567, 1);
-          vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb4567x0123, va2x01234567, 1);
-          vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb4567x4567, va2x01234567, 1);
-          vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb4567x89AB, va2x01234567, 1);
-          vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb4567xCDEF, va2x01234567, 1);
-          vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb4567x0123, va3x01234567, 1);
-          vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb4567x4567, va3x01234567, 1);
-          vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb4567x89AB, va3x01234567, 1);
-          vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb4567xCDEF, va3x01234567, 1);
-          vacc4x0123 = vdotq_lane_s32(vacc4x0123, vb4567x0123, va4x01234567, 1);
-          vacc4x4567 = vdotq_lane_s32(vacc4x4567, vb4567x4567, va4x01234567, 1);
-          vacc4x89AB = vdotq_lane_s32(vacc4x89AB, vb4567x89AB, va4x01234567, 1);
-          vacc4xCDEF = vdotq_lane_s32(vacc4xCDEF, vb4567xCDEF, va4x01234567, 1);
-          vacc5x0123 = vdotq_lane_s32(vacc5x0123, vb4567x0123, va5x01234567, 1);
-          vacc5x4567 = vdotq_lane_s32(vacc5x4567, vb4567x4567, va5x01234567, 1);
-          vacc5x89AB = vdotq_lane_s32(vacc5x89AB, vb4567x89AB, va5x01234567, 1);
-          vacc5xCDEF = vdotq_lane_s32(vacc5xCDEF, vb4567xCDEF, va5x01234567, 1);
-        }
       }
       p -= 6 * sizeof(void*);
     } while (p != 0);
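
Rounding kc up front is what allows the `if (k > 4)` remainder blocks to be dropped from these neondot kernels: once kc is a multiple of 4, the leftover after the 8-byte main loop is either 0 or exactly 4 bytes. A minimal sketch of the rounding (the helper name below is illustrative; the kernels use round_up_po2 from <xnnpack/math.h>):

#include <stddef.h>

// Round n up to the next multiple of q, where q must be a power of two.
static inline size_t round_up_po2_sketch(size_t n, size_t q) {
  return (n + q - 1) & ~(q - 1);
}
// e.g. round_up_po2_sketch(5, 4) == 8 and round_up_po2_sketch(8, 4) == 8
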
diff --git a/src/qs8-igemm/gen/6x8c4-minmax-neondot.c b/src/qs8-igemm/gen/6x8c4-minmax-neondot.c
index cf724cf..41db255 100644
--- a/src/qs8-igemm/gen/6x8c4-minmax-neondot.c
+++ b/src/qs8-igemm/gen/6x8c4-minmax-neondot.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_6x8c4__neondot(
@@ -40,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 4);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
@@ -117,10 +118,10 @@
         const int8x8_t va5x01234567 = vld1_s8(a5); a5 += 8;
 
         // Load a 8x8 block of weights.
-        const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
+        const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+        const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+        const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+        const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
 
         // Multiply-accumulate: 6x8 * 8x8 --> 6x8.
         vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb0123x0123, va0x01234567, 0);
@@ -150,7 +151,7 @@
 
         k -= 8 * sizeof(int8_t);
       }
-      // Handle up to 7 final positions of `k`
+      // Handle up to 4 final positions of `k`
       if XNN_UNLIKELY(k != 0) {
         // Load a 6x4 block of activations.
         const int8x8_t va0x01234567 = vld1_s8(a0);
@@ -161,8 +162,8 @@
         const int8x8_t va5x01234567 = vld1_s8(a5);
 
         // Load a 4x8 block of weights.
-        const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
+        const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+        const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
 
         // Multiply-accumulate: 6x4 * 4x8 --> 6x8.
         vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb0123x0123, va0x01234567, 0);
@@ -177,26 +178,6 @@
         vacc4x4567 = vdotq_lane_s32(vacc4x4567, vb0123x4567, va4x01234567, 0);
         vacc5x0123 = vdotq_lane_s32(vacc5x0123, vb0123x0123, va5x01234567, 0);
         vacc5x4567 = vdotq_lane_s32(vacc5x4567, vb0123x4567, va5x01234567, 0);
-
-        if (k > 4) {
-          // Load a 4x8 block of weights.
-          const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-          const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-
-          // Multiply-accumulate: 6x4 * 4x8 --> 6x8.
-          vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
-          vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
-          vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb4567x0123, va1x01234567, 1);
-          vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb4567x4567, va1x01234567, 1);
-          vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb4567x0123, va2x01234567, 1);
-          vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb4567x4567, va2x01234567, 1);
-          vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb4567x0123, va3x01234567, 1);
-          vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb4567x4567, va3x01234567, 1);
-          vacc4x0123 = vdotq_lane_s32(vacc4x0123, vb4567x0123, va4x01234567, 1);
-          vacc4x4567 = vdotq_lane_s32(vacc4x4567, vb4567x4567, va4x01234567, 1);
-          vacc5x0123 = vdotq_lane_s32(vacc5x0123, vb4567x0123, va5x01234567, 1);
-          vacc5x4567 = vdotq_lane_s32(vacc5x4567, vb4567x4567, va5x01234567, 1);
-        }
       }
       p -= 6 * sizeof(void*);
     } while (p != 0);
diff --git a/src/qs8-igemm/gen/8x16c4-minmax-neondot.c b/src/qs8-igemm/gen/8x16c4-minmax-neondot.c
index bb7c0f1..cbab6ea 100644
--- a/src/qs8-igemm/gen/8x16c4-minmax-neondot.c
+++ b/src/qs8-igemm/gen/8x16c4-minmax-neondot.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot(
@@ -40,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 4);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
@@ -155,14 +156,14 @@
         const int8x8_t va7x01234567 = vld1_s8(a7); a7 += 8;
 
         // Load a 8x16 block of weights.
-        const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb0123x89AB = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb0123xCDEF = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb4567x89AB = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb4567xCDEF = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
+        const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+        const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+        const int8x16_t vb0123x89AB = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+        const int8x16_t vb0123xCDEF = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+        const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+        const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+        const int8x16_t vb4567x89AB = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+        const int8x16_t vb4567xCDEF = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
 
         // Multiply-accumulate: 8x8 * 8x16 --> 8x16.
         vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb0123x0123, va0x01234567, 0);
@@ -232,7 +233,7 @@
 
         k -= 8 * sizeof(int8_t);
       }
-      // Handle up to 7 final positions of `k`
+      // Handle up to 4 final positions of `k`
       if XNN_UNLIKELY(k != 0) {
         // Load a 8x4 block of activations.
         const int8x8_t va0x01234567 = vld1_s8(a0);
@@ -245,10 +246,10 @@
         const int8x8_t va7x01234567 = vld1_s8(a7);
 
         // Load a 4x16 block of weights.
-        const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb0123x89AB = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb0123xCDEF = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
+        const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+        const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+        const int8x16_t vb0123x89AB = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+        const int8x16_t vb0123xCDEF = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
 
         // Multiply-accumulate: 8x4 * 4x16 --> 8x16.
         vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb0123x0123, va0x01234567, 0);
@@ -283,48 +284,6 @@
         vacc7x4567 = vdotq_lane_s32(vacc7x4567, vb0123x4567, va7x01234567, 0);
         vacc7x89AB = vdotq_lane_s32(vacc7x89AB, vb0123x89AB, va7x01234567, 0);
         vacc7xCDEF = vdotq_lane_s32(vacc7xCDEF, vb0123xCDEF, va7x01234567, 0);
-
-        if (k > 4) {
-          // Load a 4x16 block of weights.
-          const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-          const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-          const int8x16_t vb4567x89AB = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-          const int8x16_t vb4567xCDEF = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-
-          // Multiply-accumulate: 8x4 * 4x16 --> 8x16.
-          vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
-          vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
-          vacc0x89AB = vdotq_lane_s32(vacc0x89AB, vb4567x89AB, va0x01234567, 1);
-          vacc0xCDEF = vdotq_lane_s32(vacc0xCDEF, vb4567xCDEF, va0x01234567, 1);
-          vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb4567x0123, va1x01234567, 1);
-          vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb4567x4567, va1x01234567, 1);
-          vacc1x89AB = vdotq_lane_s32(vacc1x89AB, vb4567x89AB, va1x01234567, 1);
-          vacc1xCDEF = vdotq_lane_s32(vacc1xCDEF, vb4567xCDEF, va1x01234567, 1);
-          vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb4567x0123, va2x01234567, 1);
-          vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb4567x4567, va2x01234567, 1);
-          vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb4567x89AB, va2x01234567, 1);
-          vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb4567xCDEF, va2x01234567, 1);
-          vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb4567x0123, va3x01234567, 1);
-          vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb4567x4567, va3x01234567, 1);
-          vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb4567x89AB, va3x01234567, 1);
-          vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb4567xCDEF, va3x01234567, 1);
-          vacc4x0123 = vdotq_lane_s32(vacc4x0123, vb4567x0123, va4x01234567, 1);
-          vacc4x4567 = vdotq_lane_s32(vacc4x4567, vb4567x4567, va4x01234567, 1);
-          vacc4x89AB = vdotq_lane_s32(vacc4x89AB, vb4567x89AB, va4x01234567, 1);
-          vacc4xCDEF = vdotq_lane_s32(vacc4xCDEF, vb4567xCDEF, va4x01234567, 1);
-          vacc5x0123 = vdotq_lane_s32(vacc5x0123, vb4567x0123, va5x01234567, 1);
-          vacc5x4567 = vdotq_lane_s32(vacc5x4567, vb4567x4567, va5x01234567, 1);
-          vacc5x89AB = vdotq_lane_s32(vacc5x89AB, vb4567x89AB, va5x01234567, 1);
-          vacc5xCDEF = vdotq_lane_s32(vacc5xCDEF, vb4567xCDEF, va5x01234567, 1);
-          vacc6x0123 = vdotq_lane_s32(vacc6x0123, vb4567x0123, va6x01234567, 1);
-          vacc6x4567 = vdotq_lane_s32(vacc6x4567, vb4567x4567, va6x01234567, 1);
-          vacc6x89AB = vdotq_lane_s32(vacc6x89AB, vb4567x89AB, va6x01234567, 1);
-          vacc6xCDEF = vdotq_lane_s32(vacc6xCDEF, vb4567xCDEF, va6x01234567, 1);
-          vacc7x0123 = vdotq_lane_s32(vacc7x0123, vb4567x0123, va7x01234567, 1);
-          vacc7x4567 = vdotq_lane_s32(vacc7x4567, vb4567x4567, va7x01234567, 1);
-          vacc7x89AB = vdotq_lane_s32(vacc7x89AB, vb4567x89AB, va7x01234567, 1);
-          vacc7xCDEF = vdotq_lane_s32(vacc7xCDEF, vb4567xCDEF, va7x01234567, 1);
-        }
       }
       p -= 8 * sizeof(void*);
     } while (p != 0);
diff --git a/src/qs8-igemm/gen/8x8c4-minmax-neondot.c b/src/qs8-igemm/gen/8x8c4-minmax-neondot.c
index 7bdc2f3..0d680cc 100644
--- a/src/qs8-igemm/gen/8x8c4-minmax-neondot.c
+++ b/src/qs8-igemm/gen/8x8c4-minmax-neondot.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_8x8c4__neondot(
@@ -40,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 4);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
@@ -139,10 +140,10 @@
         const int8x8_t va7x01234567 = vld1_s8(a7); a7 += 8;
 
         // Load a 8x8 block of weights.
-        const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
+        const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+        const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+        const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+        const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
 
         // Multiply-accumulate: 8x8 * 8x8 --> 8x8.
         vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb0123x0123, va0x01234567, 0);
@@ -180,7 +181,7 @@
 
         k -= 8 * sizeof(int8_t);
       }
-      // Handle up to 7 final positions of `k`
+      // Handle up to 4 final positions of `k`
       if XNN_UNLIKELY(k != 0) {
         // Load a 8x4 block of activations.
         const int8x8_t va0x01234567 = vld1_s8(a0);
@@ -193,8 +194,8 @@
         const int8x8_t va7x01234567 = vld1_s8(a7);
 
         // Load a 4x8 block of weights.
-        const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-        const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
+        const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
+        const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
 
         // Multiply-accumulate: 8x4 * 4x8 --> 8x8.
         vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb0123x0123, va0x01234567, 0);
@@ -213,30 +214,6 @@
         vacc6x4567 = vdotq_lane_s32(vacc6x4567, vb0123x4567, va6x01234567, 0);
         vacc7x0123 = vdotq_lane_s32(vacc7x0123, vb0123x0123, va7x01234567, 0);
         vacc7x4567 = vdotq_lane_s32(vacc7x4567, vb0123x4567, va7x01234567, 0);
-
-        if (k > 4) {
-          // Load a 4x8 block of weights.
-          const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-          const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*)((const int8_t*)w + 16);
-
-          // Multiply-accumulate: 8x4 * 4x8 --> 8x8.
-          vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
-          vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
-          vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb4567x0123, va1x01234567, 1);
-          vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb4567x4567, va1x01234567, 1);
-          vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb4567x0123, va2x01234567, 1);
-          vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb4567x4567, va2x01234567, 1);
-          vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb4567x0123, va3x01234567, 1);
-          vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb4567x4567, va3x01234567, 1);
-          vacc4x0123 = vdotq_lane_s32(vacc4x0123, vb4567x0123, va4x01234567, 1);
-          vacc4x4567 = vdotq_lane_s32(vacc4x4567, vb4567x4567, va4x01234567, 1);
-          vacc5x0123 = vdotq_lane_s32(vacc5x0123, vb4567x0123, va5x01234567, 1);
-          vacc5x4567 = vdotq_lane_s32(vacc5x4567, vb4567x4567, va5x01234567, 1);
-          vacc6x0123 = vdotq_lane_s32(vacc6x0123, vb4567x0123, va6x01234567, 1);
-          vacc6x4567 = vdotq_lane_s32(vacc6x4567, vb4567x4567, va6x01234567, 1);
-          vacc7x0123 = vdotq_lane_s32(vacc7x0123, vb4567x0123, va7x01234567, 1);
-          vacc7x4567 = vdotq_lane_s32(vacc7x4567, vb4567x4567, va7x01234567, 1);
-        }
       }
       p -= 8 * sizeof(void*);
     } while (p != 0);
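
The vdotq_lane_s32 steps in these *c4 neondot kernels add, to each 32-bit accumulator lane, a 4-element signed-byte dot product against the 4 activation bytes selected by the lane index. A scalar model, illustrative only and not part of the patch:

#include <stdint.h>

// Model of vdotq_lane_s32(acc, b, a, lane): acc[i] += dot(b[4i..4i+3], a[4*lane..4*lane+3]).
static void vdotq_lane_s32_model(int32_t acc[4], const int8_t b[16], const int8_t a[8], int lane) {
  for (int i = 0; i < 4; i++) {
    int32_t sum = 0;
    for (int j = 0; j < 4; j++) {
      sum += (int32_t) b[4 * i + j] * (int32_t) a[4 * lane + j];
    }
    acc[i] += sum;
  }
}
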
diff --git a/src/qs8-igemm/neon-mull-addw-dup.c.in b/src/qs8-igemm/neon-mull-addw-dup.c.in
new file mode 100644
index 0000000..45f3ff4
--- /dev/null
+++ b/src/qs8-igemm/neon-mull-addw-dup.c.in
@@ -0,0 +1,321 @@
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+$assert NR % 8 == 0
+$assert 8 <= NR <= 16
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/gemm.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_${MR}x${NR}__neon_mull_addw_dup(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= ${MR});
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (${MR} * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  int8_t* c0 = c;
+  $for M in range(1, MR):
+    int8_t* c${M} = (int8_t*) ((uintptr_t) c${M-1} + cm_stride);
+    $if M % 2 == 0:
+      if XNN_UNPREDICTABLE(mr <= ${M}) {
+        c${M} = c${M-1};
+      }
+    $elif M + 1 == MR:
+      if XNN_UNPREDICTABLE(mr != ${M+1}) {
+        c${M} = c${M-1};
+      }
+    $else:
+      if XNN_UNPREDICTABLE(mr < ${M+1}) {
+        c${M} = c${M-1};
+      }
+
+  do {
+    $for N in range(0, NR, 4):
+      int32x4_t vacc0x${ABC[N:N+4]} = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    $for M in range(1, MR):
+      $for N in range(0, NR, 4):
+        int32x4_t vacc${M}x${ABC[N:N+4]} = vacc0x${ABC[N:N+4]};
+
+    size_t p = ks;
+    do {
+      $for M in range(MR):
+        const int8_t* restrict a${M} = a[${M}];
+        if XNN_UNPREDICTABLE(a${M} != zero) {
+          a${M} = (const int8_t*) ((uintptr_t) a${M} + a_offset);
+        }
+      a += ${MR};
+
+      size_t k = kc;
+      while (k >= 8 * sizeof(int8_t)) {
+        $for M in range(MR):
+          const int8x8_t va${M} = vld1_s8(a${M}); a${M} += 8;
+
+        $for K in range(8):
+          $for N in range(0, NR, 8):
+            const int8x8_t vb${ABC[N:N+8]}c${K} = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+            $for M in range(MR):
+              const int16x8_t vprod${M}x${ABC[N:N+8]}c${K} = vmull_s8(vb${ABC[N:N+8]}c${K}, vdup_lane_s8(va${M}, ${K}));
+              vacc${M}x${ABC[N:N+4]} = vaddw_s16(vacc${M}x${ABC[N:N+4]}, vget_low_s16(vprod${M}x${ABC[N:N+8]}c${K}));
+              vacc${M}x${ABC[N+4:N+8]} = vaddw_s16(vacc${M}x${ABC[N+4:N+8]}, vget_high_s16(vprod${M}x${ABC[N:N+8]}c${K}));
+
+        k -= 8 * sizeof(int8_t);
+      }
+      if XNN_UNLIKELY(k != 0) {
+        $for M in range(MR):
+          const int8x8_t va${M} = vld1_s8(a${M}); a${M} = (const int8_t*) ((uintptr_t) a${M} + k);
+
+        $for N in range(0, NR, 8):
+          const int8x8_t vb${ABC[N:N+8]}c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        $for M in range(MR):
+          $for N in range(0, NR, 8):
+            const int16x8_t vprod${M}x${ABC[N:N+8]}c0 = vmull_s8(vb${ABC[N:N+8]}c0, vdup_lane_s8(va${M}, 0));
+            vacc${M}x${ABC[N:N+4]} = vaddw_s16(vacc${M}x${ABC[N:N+4]}, vget_low_s16(vprod${M}x${ABC[N:N+8]}c0));
+            vacc${M}x${ABC[N+4:N+8]} = vaddw_s16(vacc${M}x${ABC[N+4:N+8]}, vget_high_s16(vprod${M}x${ABC[N:N+8]}c0));
+
+        if (k >= 2 * sizeof(int8_t)) {
+          $for N in range(0, NR, 8):
+            const int8x8_t vb${ABC[N:N+8]}c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          $for M in range(MR):
+            $for N in range(0, NR, 8):
+              const int16x8_t vprod${M}x${ABC[N:N+8]}c1 = vmull_s8(vb${ABC[N:N+8]}c1, vdup_lane_s8(va${M}, 1));
+              vacc${M}x${ABC[N:N+4]} = vaddw_s16(vacc${M}x${ABC[N:N+4]}, vget_low_s16(vprod${M}x${ABC[N:N+8]}c1));
+              vacc${M}x${ABC[N+4:N+8]} = vaddw_s16(vacc${M}x${ABC[N+4:N+8]}, vget_high_s16(vprod${M}x${ABC[N:N+8]}c1));
+
+          if (k > 2 * sizeof(int8_t)) {
+            $for N in range(0, NR, 8):
+              const int8x8_t vb${ABC[N:N+8]}c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+            $for M in range(MR):
+              $for N in range(0, NR, 8):
+                const int16x8_t vprod${M}x${ABC[N:N+8]}c2 = vmull_s8(vb${ABC[N:N+8]}c2, vdup_lane_s8(va${M}, 2));
+                vacc${M}x${ABC[N:N+4]} = vaddw_s16(vacc${M}x${ABC[N:N+4]}, vget_low_s16(vprod${M}x${ABC[N:N+8]}c2));
+                vacc${M}x${ABC[N+4:N+8]} = vaddw_s16(vacc${M}x${ABC[N+4:N+8]}, vget_high_s16(vprod${M}x${ABC[N:N+8]}c2));
+
+            if (k >= 4 * sizeof(int8_t)) {
+              $for N in range(0, NR, 8):
+                const int8x8_t vb${ABC[N:N+8]}c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+              $for M in range(MR):
+                $for N in range(0, NR, 8):
+                  const int16x8_t vprod${M}x${ABC[N:N+8]}c3 = vmull_s8(vb${ABC[N:N+8]}c3, vdup_lane_s8(va${M}, 3));
+                  vacc${M}x${ABC[N:N+4]} = vaddw_s16(vacc${M}x${ABC[N:N+4]}, vget_low_s16(vprod${M}x${ABC[N:N+8]}c3));
+                  vacc${M}x${ABC[N+4:N+8]} = vaddw_s16(vacc${M}x${ABC[N+4:N+8]}, vget_high_s16(vprod${M}x${ABC[N:N+8]}c3));
+
+              if (k > 4 * sizeof(int8_t)) {
+                $for N in range(0, NR, 8):
+                  const int8x8_t vb${ABC[N:N+8]}c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+                $for M in range(MR):
+                  $for N in range(0, NR, 8):
+                    const int16x8_t vprod${M}x${ABC[N:N+8]}c4 = vmull_s8(vb${ABC[N:N+8]}c4, vdup_lane_s8(va${M}, 4));
+                    vacc${M}x${ABC[N:N+4]} = vaddw_s16(vacc${M}x${ABC[N:N+4]}, vget_low_s16(vprod${M}x${ABC[N:N+8]}c4));
+                    vacc${M}x${ABC[N+4:N+8]} = vaddw_s16(vacc${M}x${ABC[N+4:N+8]}, vget_high_s16(vprod${M}x${ABC[N:N+8]}c4));
+
+                if (k >= 6 * sizeof(int8_t)) {
+                  $for N in range(0, NR, 8):
+                    const int8x8_t vb${ABC[N:N+8]}c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+                  $for M in range(MR):
+                    $for N in range(0, NR, 8):
+                      const int16x8_t vprod${M}x${ABC[N:N+8]}c5 = vmull_s8(vb${ABC[N:N+8]}c5, vdup_lane_s8(va${M}, 5));
+                      vacc${M}x${ABC[N:N+4]} = vaddw_s16(vacc${M}x${ABC[N:N+4]}, vget_low_s16(vprod${M}x${ABC[N:N+8]}c5));
+                      vacc${M}x${ABC[N+4:N+8]} = vaddw_s16(vacc${M}x${ABC[N+4:N+8]}, vget_high_s16(vprod${M}x${ABC[N:N+8]}c5));
+
+                  if (k > 6 * sizeof(int8_t)) {
+                    $for N in range(0, NR, 8):
+                      const int8x8_t vb${ABC[N:N+8]}c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+                    $for M in range(MR):
+                      $for N in range(0, NR, 8):
+                        const int16x8_t vprod${M}x${ABC[N:N+8]}c6 = vmull_s8(vb${ABC[N:N+8]}c6, vdup_lane_s8(va${M}, 6));
+                        vacc${M}x${ABC[N:N+4]} = vaddw_s16(vacc${M}x${ABC[N:N+4]}, vget_low_s16(vprod${M}x${ABC[N:N+8]}c6));
+                        vacc${M}x${ABC[N+4:N+8]} = vaddw_s16(vacc${M}x${ABC[N+4:N+8]}, vget_high_s16(vprod${M}x${ABC[N:N+8]}c6));
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+      p -= ${MR} * sizeof(void*);
+    } while (p != 0);
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    $for M in range(MR):
+      $for N in range(0, NR, 4):
+        vacc${M}x${ABC[N:N+4]} = vqrdmulhq_s32(vacc${M}x${ABC[N:N+4]}, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    $for M in range(MR):
+      $for N in range(0, NR, 4):
+        vacc${M}x${ABC[N:N+4]} = vsraq_n_s32(vacc${M}x${ABC[N:N+4]}, vbicq_s32(vacc${M}x${ABC[N:N+4]}, vzero_shift_mask), 31);
+
+    $for M in range(MR):
+      $for N in range(0, NR, 4):
+        vacc${M}x${ABC[N:N+4]} = vrshlq_s32(vacc${M}x${ABC[N:N+4]}, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    $for M in range(MR):
+      $for N in range(0, NR, 8):
+        const int16x8_t vacc${M}x${ABC[N:N+8]} = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc${M}x${ABC[N:N+4]}), vacc${M}x${ABC[N+4:N+8]}), voutput_zero_point);
+
+    $for M in range(MR):
+      $for N in range(0, NR, 16):
+        $if N + 8 < NR:
+          int8x16_t vout${M}x${ABC[N:N+16]} = vqmovn_high_s16(vqmovn_s16(vacc${M}x${ABC[N:N+8]}), vacc${M}x${ABC[N+8:N+16]});
+        $elif M % 2 == 1:
+          int8x16_t vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vqmovn_high_s16(vqmovn_s16(vacc${M-1}x${ABC[N:N+8]}), vacc${M}x${ABC[N:N+8]});
+        $elif M + 1 == MR:
+          int8x8_t vout${M}x${ABC[N:N+8]} = vqmovn_s16(vacc${M}x${ABC[N:N+8]});
+#else
+    $for M in range(MR):
+      $for N in range(0, NR, 8):
+        const int16x8_t vacc${M}x${ABC[N:N+8]} = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc${M}x${ABC[N:N+4]}), vqmovn_s32(vacc${M}x${ABC[N+4:N+8]})), voutput_zero_point);
+
+    $for M in range(MR):
+      $for N in range(0, NR, 16):
+        $if N + 8 < NR:
+          int8x16_t vout${M}x${ABC[N:N+16]} = vcombine_s8(vqmovn_s16(vacc${M}x${ABC[N:N+8]}), vqmovn_s16(vacc${M}x${ABC[N+8:N+16]}));
+        $elif M % 2 == 1:
+          int8x16_t vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vcombine_s8(vqmovn_s16(vacc${M-1}x${ABC[N:N+8]}), vqmovn_s16(vacc${M}x${ABC[N:N+8]}));
+        $elif M + 1 == MR:
+          int8x8_t vout${M}x${ABC[N:N+8]} = vqmovn_s16(vacc${M}x${ABC[N:N+8]});
+#endif
+    $if NR == 8 and MR == 1:
+      const int8x8_t voutput_min = vld1_dup_s8(&params->neon.output_min);
+      const int8x8_t voutput_max = vld1_dup_s8(&params->neon.output_max);
+    $else:
+      const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+      const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    $for M in reversed(range(MR)):
+      $for N in range(0, NR, 16):
+        $if N + 8 < NR:
+          vout${M}x${ABC[N:N+16]} = vmaxq_s8(vout${M}x${ABC[N:N+16]}, voutput_min);
+        $elif M % 2 == 1:
+          vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vmaxq_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}, voutput_min);
+        $elif M + 1 == MR:
+          $if NR == 8 and MR == 1:
+            vout${M}x${ABC[N:N+8]} = vmax_s8(vout${M}x${ABC[N:N+8]}, voutput_min);
+          $else:
+            vout${M}x${ABC[N:N+8]} = vmax_s8(vout${M}x${ABC[N:N+8]}, vget_low_s8(voutput_min));
+
+    $for M in reversed(range(MR)):
+      $for N in range(0, NR, 16):
+        $if N + 8 < NR:
+          vout${M}x${ABC[N:N+16]} = vminq_s8(vout${M}x${ABC[N:N+16]}, voutput_max);
+        $elif M % 2 == 1:
+          vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vminq_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}, voutput_max);
+        $elif M + 1 == MR:
+          $if NR == 8 and MR == 1:
+            vout${M}x${ABC[N:N+8]} = vmin_s8(vout${M}x${ABC[N:N+8]}, voutput_max);
+          $else:
+            vout${M}x${ABC[N:N+8]} = vmin_s8(vout${M}x${ABC[N:N+8]}, vget_low_s8(voutput_max));
+
+    if (nc >= ${NR}) {
+      $for M in reversed(range(MR)):
+        $for N in range(0, NR, 16):
+          $if N + 8 < NR:
+            vst1q_s8(c${M} + ${N}, vout${M}x${ABC[N:N+16]});
+          $elif M % 2 == 1:
+            vst1_s8(c${M} + ${N}, vget_high_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}));
+            vst1_s8(c${M-1} + ${N}, vget_low_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}));
+          $elif M + 1 == MR:
+            vst1_s8(c${M} + ${N}, vout${M}x${ABC[N:N+8]});
+
+      $for M in reversed(range(MR)):
+        c${M} = (int8_t*) ((uintptr_t) c${M} + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= ${NR};
+    } else {
+      $if NR == 16:
+        $for M in range(MR):
+          $if M % 2 == 1:
+            int8x16_t vout${M-1}x01234567_${M}x01234567 = vcombine_s8(vget_low_s8(vout${M-1}x0123456789ABCDEF), vget_low_s8(vout${M}x0123456789ABCDEF));
+          $elif M + 1 == MR:
+            int8x8_t vout${M}x01234567 = vget_low_s8(vout${M}x0123456789ABCDEF);
+        if (nc & 8) {
+          $for M in reversed(range(MR)):
+            $if M % 2 == 1:
+              vst1_s8(c${M}, vget_high_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]})); c${M} += 8;
+              vst1_s8(c${M-1}, vget_low_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]})); c${M-1} += 8;
+            $elif M + 1 == MR:
+              vst1_s8(c${M}, vout${M}x${ABC[N:N+8]}); c${M} += 8;
+          $for M in reversed(range(MR)):
+            $if M % 2 == 1:
+              vout${M-1}x01234567_${M}x01234567 = vcombine_s8(vget_high_s8(vout${M-1}x0123456789ABCDEF), vget_high_s8(vout${M}x0123456789ABCDEF));
+            $elif M + 1 == MR:
+              vout${M}x01234567 = vget_high_s8(vout${M}x0123456789ABCDEF);
+        }
+      if (nc & 4) {
+        $for M in reversed(range(MR)):
+          $if M % 2 == 1:
+            vst1q_lane_u32(__builtin_assume_aligned(c${M}, 1), vreinterpretq_u32_s8(vout${M-1}x01234567_${M}x01234567), 2); c${M} += 4;
+            vst1q_lane_u32(__builtin_assume_aligned(c${M-1}, 1), vreinterpretq_u32_s8(vout${M-1}x01234567_${M}x01234567), 0); c${M-1} += 4;
+          $elif M + 1 == MR:
+            vst1_lane_u32(__builtin_assume_aligned(c${M}, 1), vreinterpret_u32_s8(vout${M}x01234567), 0); c${M} += 4;
+        $for M in reversed(range(MR)):
+          $if M % 2 == 1:
+            vout${M-1}x01234567_${M}x01234567 = vextq_s8(vout${M-1}x01234567_${M}x01234567, vout${M-1}x01234567_${M}x01234567, 4);
+          $elif M + 1 == MR:
+            vout${M}x01234567 = vext_s8(vout${M}x01234567, vout${M}x01234567, 4);
+      }
+      if (nc & 2) {
+        $for M in reversed(range(MR)):
+          $if M % 2 == 1:
+            vst1q_lane_u16(__builtin_assume_aligned(c${M}, 1), vreinterpretq_u16_s8(vout${M-1}x01234567_${M}x01234567), 4); c${M} += 2;
+            vst1q_lane_u16(__builtin_assume_aligned(c${M-1}, 1), vreinterpretq_u16_s8(vout${M-1}x01234567_${M}x01234567), 0); c${M-1} += 2;
+          $elif M + 1 == MR:
+            vst1_lane_u16(__builtin_assume_aligned(c${M}, 1), vreinterpret_u16_s8(vout${M}x01234567), 0); c${M} += 2;
+        $for M in reversed(range(MR)):
+          $if M % 2 == 1:
+            vout${M-1}x01234567_${M}x01234567 = vextq_s8(vout${M-1}x01234567_${M}x01234567, vout${M-1}x01234567_${M}x01234567, 2);
+          $elif M + 1 == MR:
+            vout${M}x01234567 = vext_s8(vout${M}x01234567, vout${M}x01234567, 2);
+      }
+      if (nc & 1) {
+        $for M in reversed(range(MR)):
+          $if M % 2 == 1:
+            vst1q_lane_s8(c${M}, vout${M-1}x01234567_${M}x01234567, 8);
+            vst1q_lane_s8(c${M-1}, vout${M-1}x01234567_${M}x01234567, 0);
+          $elif M + 1 == MR:
+            vst1_lane_s8(c${M}, vout${M}x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
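
As a sanity check of the template above, one K position of its inner loop expands, for MR=1 and NR=8, to roughly the following. This is an assumed sketch, not verbatim generator output; it requires a NEON-capable ARM target.

#include <arm_neon.h>

static void mull_addw_dup_step(int32x4_t* vacc0x0123, int32x4_t* vacc0x4567,
                               int8x8_t va0, const int8_t* w) {
  // Load 8 weights for this K position.
  const int8x8_t vb01234567c0 = vld1_s8(w);
  // Multiply all 8 weights by activation byte 0 (duplicated across lanes),
  // widening the products to 16 bits.
  const int16x8_t vprod0x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va0, 0));
  // Widen-accumulate the low and high halves into the two int32x4 accumulators.
  *vacc0x0123 = vaddw_s16(*vacc0x0123, vget_low_s16(vprod0x01234567c0));
  *vacc0x4567 = vaddw_s16(*vacc0x4567, vget_high_s16(vprod0x01234567c0));
}
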
diff --git a/src/qu8-gemm/2x4c8-minmax-sse2.c b/src/qu8-gemm/2x4c8-minmax-sse2.c
index 371fe74..e82204d 100644
--- a/src/qu8-gemm/2x4c8-minmax-sse2.c
+++ b/src/qu8-gemm/2x4c8-minmax-sse2.c
@@ -48,7 +48,12 @@
   assert(mr <= 2);
   assert(nc != 0);
   assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const uint8_t* a0 = a;
   uint8_t* c0 = c;
   const uint8_t* a1 = (const uint8_t*) ((uintptr_t) a0 + a_stride);
@@ -58,7 +63,6 @@
     c1 = c0;
   }
 
-  const size_t kc_stride = round_up_po2(kc, 8);
   const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->sse2.kernel_zero_point);
 
   do {
@@ -173,8 +177,8 @@
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(_mm_srli_epi64(vout, 32));
 
-      a0 = (const uint8_t*) ((uintptr_t) a0 - kc_stride);
-      a1 = (const uint8_t*) ((uintptr_t) a1 - kc_stride);
+      a0 = (const uint8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const uint8_t*) ((uintptr_t) a1 - kc);
 
       c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (uint8_t*) ((uintptr_t) c1 + cn_stride);
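
A minimal check, illustrative only, that rounding kc itself up to a multiple of 8 leaves the rewind distance identical to the removed kc_stride variable:

#include <assert.h>
#include <stddef.h>

int main(void) {
  const size_t kc_in = 12;
  const size_t kc_stride = (kc_in + 7) & ~(size_t) 7;  // old: separate stride -> 16
  size_t kc = kc_in;
  kc = (kc + 7) & ~(size_t) 7;                         // new: round kc up front -> 16
  assert(kc == kc_stride);
  return 0;
}
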
diff --git a/src/qu8-gemm/4x4c2-minmax-sse2.c b/src/qu8-gemm/4x4c2-minmax-sse2.c
index 45afdd2..6f687ce 100644
--- a/src/qu8-gemm/4x4c2-minmax-sse2.c
+++ b/src/qu8-gemm/4x4c2-minmax-sse2.c
@@ -11,6 +11,7 @@
 #include <immintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qu8_gemm_minmax_ukernel_4x4c2__sse2(
@@ -29,7 +30,12 @@
   assert(mr <= 4);
   assert(nc != 0);
   assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const uint8_t* a0 = a;
   uint8_t* c0 = c;
   const uint8_t* a1 = (const uint8_t*) ((uintptr_t) a0 + a_stride);
@@ -181,21 +187,6 @@
             _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
           vacc3x0123 = _mm_add_epi32(vacc3x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-          if (k > 6 * sizeof(uint8_t)) {
-            const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-            w = (const void*) ((uintptr_t) w + 8);
-            const __m128i vxb3 = _mm_sub_epi16(_mm_unpacklo_epi8(vb3, vzero), vb_zero_point);
-
-            vacc0x0123 = _mm_add_epi32(vacc0x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc1x0123 = _mm_add_epi32(vacc1x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc2x0123 = _mm_add_epi32(vacc2x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc3x0123 = _mm_add_epi32(vacc3x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-          }
         }
       }
     }
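
Each _mm_madd_epi16 step in this 4x4c2 kernel pairs a broadcast 2-byte activation group with the corresponding weight pairs of the four output columns. A scalar model of one such step (names are illustrative, not XNNPACK API; activations are already zero-extended to int16 and the weights already have the kernel zero point subtracted):

#include <stdint.h>

static void madd_c2_model(int32_t acc[4], const int16_t a_pair[2], const int16_t b_pairs[8]) {
  for (int n = 0; n < 4; n++) {
    acc[n] += (int32_t) a_pair[0] * b_pairs[2 * n + 0]
            + (int32_t) a_pair[1] * b_pairs[2 * n + 1];
  }
}
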
diff --git a/src/qu8-gemm/MRxNRc4-minmax-scalar.c.in b/src/qu8-gemm/MRxNRc4-minmax-scalar.c.in
deleted file mode 100644
index a66a03e..0000000
--- a/src/qu8-gemm/MRxNRc4-minmax-scalar.c.in
+++ /dev/null
@@ -1,167 +0,0 @@
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <xnnpack/gemm.h>
-
-#include <xnnpack/scalar-utils.h>
-
-// This kernel is a scalar model for a kernel using ARMv8.2 dot-product
-// instructions.
-//
-// XNN_DISABLE_TSAN is used because this kernel reads up to 3 bytes past the
-// bounds of the `a` matrix region, which may be a race condition with
-// another thread. We deem this acceptable because the values that are
-// read out of bounds do not affect the result, and the compiler can't know
-// about this undefined behavior.
-void xnn_qu8_gemm_minmax_ukernel_${MR}x${NR}c4__scalar(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    const uint8_t* restrict a,
-    size_t a_stride,
-    const void* restrict w,
-    uint8_t* restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    const union xnn_qu8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN {
-  assert(mr != 0);
-  assert(mr <= ${MR});
-  assert(nc != 0);
-  assert(kc != 0);
-
-  const uint8_t* a0 = a;
-  uint8_t* c0 = c;
-  $for M in range(1, MR):
-    const uint8_t* a${M} = (const uint8_t*) ((uintptr_t) a${M-1} + a_stride);
-    uint8_t* c${M} = (uint8_t*) ((uintptr_t) c${M-1} + cm_stride);
-    $if M % 2 == 0:
-      if XNN_UNPREDICTABLE(mr <= ${M}) {
-        a${M} = a${M-1};
-        c${M} = c${M-1};
-      }
-    $elif M + 1 == MR:
-      if XNN_UNPREDICTABLE(mr != ${M+1}) {
-        a${M} = a${M-1};
-        c${M} = c${M-1};
-      }
-    $else:
-      if XNN_UNPREDICTABLE(mr < ${M+1}) {
-        a${M} = a${M-1};
-        c${M} = c${M-1};
-      }
-
-  const int32_t vb_zero_point = params->scalar.kernel_zero_point;
-
-  // Loop over groups of ${NR} columns.
-  do {
-    // `vaccMN` is the accumulator at row `M`, column `N`.
-    // Initialize accumulators with bias. ${NR} bias values are loaded from the
-    // weight matrix, at the start of the group of ${NR} columns.
-    $for N in range(NR):
-      int32_t bias${N} = ((const int32_t*)w)[${N}];
-      $for M in range(MR):
-        int32_t vacc${M}${N} = bias${N};
-
-    w = (const void*)((uintptr_t)w + ${NR} * sizeof(int32_t));
-
-    // Inner accumulation loop along the ${NR} columns.
-    // Handle 4 rows at each iteration: this is key to modelling what an
-    // actual kernel using ARMv8.2 dot-product instructions would look like.
-    size_t k = 0;
-    while (k < kc) {
-      // Load a ${MR}x4 block of activations, and compute sums along rows.
-      $for M in range(MR):
-        int16_t vasum${M} = 0;
-        $for K in range(4):
-          int32_t va${M}${K} = *a${M}++;
-          vasum${M} += (int16_t) va${M}${K};
-
-      // Load a 4x${NR} block of weights.
-      $for N in range(NR):
-        $for K in range(4):
-          int32_t vb${K}${N} = (int32_t) ((const uint8_t*)w)[${K}];
-
-        w = (const void*)((uintptr_t)w + 4 * sizeof(uint8_t));
-
-      // Multiply-accumulate: ${MR}x4 * 4x${NR} --> ${MR}x${NR}. The inner size 4 here means
-      // we're computing 4D dot-products, which makes this a model for
-      // an ARMv8.2 dot-product kernel.
-      $for M in range(MR):
-        $for N in range(NR):
-          $for K in range(4):
-            vacc${M}${N} += va${M}${K} * vb${K}${N};
-          vacc${M}${N} -= ((int32_t) vasum${M}) * vb_zero_point;
-
-      k += 4 * sizeof(uint8_t);
-    }
-    // End of accumulation loop. The variable `k` contains the amount by which
-    // we advanced the `va` pointers, so we rewind by this amount now.
-    $for M in range(MR):
-      a${M} = (const uint8_t*)((uintptr_t)a${M} - k);
-
-    // Post-accumulation work
-
-    const int32_t vmultiplier = params->scalar.multiplier;
-    const int64_t vq31rounding = INT64_C(0x40000000);
-    const int32_t vremainder_mask = params->scalar.remainder_mask;
-    const uint32_t vshift = params->scalar.shift;
-    const int32_t vremainder_threshold = params->scalar.remainder_threshold;
-    const int32_t voutput_min = params->scalar.output_min_less_zero_point;
-    const int32_t voutput_max = params->scalar.output_max_less_zero_point;
-    const int32_t voutput_zero_point = params->scalar.output_zero_point;
-
-    $for M in range(MR):
-      $for N in range(NR):
-        const int64_t vproduct${M}${N} = (int64_t)vacc${M}${N} * (int64_t)vmultiplier;
-
-    $for M in range(MR):
-      $for N in range(NR):
-        const int32_t vq31product${M}${N} = (int32_t)(uint32_t)((uint64_t)(vproduct${M}${N} + vq31rounding) >> 31);
-
-    $for M in range(MR):
-      $for N in range(NR):
-        const int32_t vremainder${M}${N} = (vq31product${M}${N} & vremainder_mask) - (int32_t)(vq31product${M}${N} < 0);
-
-    $for M in range(MR):
-      $for N in range(NR):
-        int32_t vout${M}${N} = asr_s32(vq31product${M}${N}, vshift) + (int32_t)(vremainder${M}${N} > vremainder_threshold);
-
-    $for M in range(MR):
-      $for N in range(NR):
-        vout${M}${N} = vout${M}${N} < voutput_min ? voutput_min : vout${M}${N};
-
-    $for M in range(MR):
-      $for N in range(NR):
-        vout${M}${N} = vout${M}${N} > voutput_max ? voutput_max : vout${M}${N};
-
-    $for M in range(MR):
-      $for N in range(NR):
-        vout${M}${N} += voutput_zero_point;
-
-    if XNN_LIKELY (nc >= ${NR}) {
-      // Main case where the ${NR} columns fit in the destination.
-      $for M in range(MR):
-        $for N in range(NR):
-          c${M}[${N}] = vout${M}${N};
-
-      // Advance to the next ${NR} columns.
-      $for M in range(MR):
-        c${M} = (uint8_t*)((uintptr_t)c${M} + cn_stride);
-
-      nc -= ${NR};
-    } else {
-      // Final case where not all of the ${NR} columns fit in the destination.
-      $for N in range(NR):
-        if (nc > ${N}) {
-          $for M in range(MR):
-            c${M}[${N}] = vout${M}${N};
-        }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
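The inner loop of this deleted template (and of the generated kernels removed below) models one ARMv8.2 dot-product step per row/column pair: four raw uint8 products are accumulated, and the kernel zero point is folded out once via the row sum `vasumM` instead of being subtracted from every weight. A minimal standalone C sketch of that step, with an illustrative helper name that is not part of XNNPACK and `kernel_zero_point` standing in for `params->scalar.kernel_zero_point`:

#include <stdint.h>

// One c4 inner step: a 4-element dot product with the kernel zero point
// removed via the row sum, as a UDOT-based kernel would do it.
static inline int32_t dot4_with_zero_point(
    const uint8_t a[4],          // 4 activation bytes from one row
    const uint8_t b[4],          // 4 weight bytes from one column
    int32_t kernel_zero_point) {
  int32_t acc = 0;
  int32_t row_sum = 0;
  for (int k = 0; k < 4; k++) {
    acc += (int32_t) a[k] * (int32_t) b[k];  // raw u8*u8 products
    row_sum += (int32_t) a[k];
  }
  // Equivalent to summing a[k] * (b[k] - kernel_zero_point), but keeps the
  // multiplies in the unsigned 8-bit domain that dot-product instructions use.
  return acc - row_sum * kernel_zero_point;
}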
diff --git a/src/qu8-gemm/gen/12x4c4-minmax-scalar.c b/src/qu8-gemm/gen/12x4c4-minmax-scalar.c
deleted file mode 100644
index af6eb68..0000000
--- a/src/qu8-gemm/gen/12x4c4-minmax-scalar.c
+++ /dev/null
@@ -1,1055 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/qu8-gemm/MRxNRc4-minmax-scalar.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <xnnpack/gemm.h>
-
-#include <xnnpack/scalar-utils.h>
-
-// This kernel is a scalar model for a kernel using ARMv8.2 dot-product
-// instructions.
-//
-// XNN_DISABLE_TSAN is used because this kernel reads up to 3 bytes past the
-// bounds of the `a` matrix region, which may be a race condition with
-// another thread. We deem this acceptable because the values that are
-// read out of bounds do not affect the result, and the compiler can't know
-// about this undefined behavior.
-void xnn_qu8_gemm_minmax_ukernel_12x4c4__scalar(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    const uint8_t* restrict a,
-    size_t a_stride,
-    const void* restrict w,
-    uint8_t* restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    const union xnn_qu8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN {
-  assert(mr != 0);
-  assert(mr <= 12);
-  assert(nc != 0);
-  assert(kc != 0);
-
-  const uint8_t* a0 = a;
-  uint8_t* c0 = c;
-  const uint8_t* a1 = (const uint8_t*) ((uintptr_t) a0 + a_stride);
-  uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 2) {
-    a1 = a0;
-    c1 = c0;
-  }
-  const uint8_t* a2 = (const uint8_t*) ((uintptr_t) a1 + a_stride);
-  uint8_t* c2 = (uint8_t*) ((uintptr_t) c1 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 2) {
-    a2 = a1;
-    c2 = c1;
-  }
-  const uint8_t* a3 = (const uint8_t*) ((uintptr_t) a2 + a_stride);
-  uint8_t* c3 = (uint8_t*) ((uintptr_t) c2 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 4) {
-    a3 = a2;
-    c3 = c2;
-  }
-  const uint8_t* a4 = (const uint8_t*) ((uintptr_t) a3 + a_stride);
-  uint8_t* c4 = (uint8_t*) ((uintptr_t) c3 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 4) {
-    a4 = a3;
-    c4 = c3;
-  }
-  const uint8_t* a5 = (const uint8_t*) ((uintptr_t) a4 + a_stride);
-  uint8_t* c5 = (uint8_t*) ((uintptr_t) c4 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 6) {
-    a5 = a4;
-    c5 = c4;
-  }
-  const uint8_t* a6 = (const uint8_t*) ((uintptr_t) a5 + a_stride);
-  uint8_t* c6 = (uint8_t*) ((uintptr_t) c5 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 6) {
-    a6 = a5;
-    c6 = c5;
-  }
-  const uint8_t* a7 = (const uint8_t*) ((uintptr_t) a6 + a_stride);
-  uint8_t* c7 = (uint8_t*) ((uintptr_t) c6 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 8) {
-    a7 = a6;
-    c7 = c6;
-  }
-  const uint8_t* a8 = (const uint8_t*) ((uintptr_t) a7 + a_stride);
-  uint8_t* c8 = (uint8_t*) ((uintptr_t) c7 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 8) {
-    a8 = a7;
-    c8 = c7;
-  }
-  const uint8_t* a9 = (const uint8_t*) ((uintptr_t) a8 + a_stride);
-  uint8_t* c9 = (uint8_t*) ((uintptr_t) c8 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 10) {
-    a9 = a8;
-    c9 = c8;
-  }
-  const uint8_t* a10 = (const uint8_t*) ((uintptr_t) a9 + a_stride);
-  uint8_t* c10 = (uint8_t*) ((uintptr_t) c9 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 10) {
-    a10 = a9;
-    c10 = c9;
-  }
-  const uint8_t* a11 = (const uint8_t*) ((uintptr_t) a10 + a_stride);
-  uint8_t* c11 = (uint8_t*) ((uintptr_t) c10 + cm_stride);
-  if XNN_UNPREDICTABLE(mr != 12) {
-    a11 = a10;
-    c11 = c10;
-  }
-
-  const int32_t vb_zero_point = params->scalar.kernel_zero_point;
-
-  // Loop over groups of 4 columns.
-  do {
-    // `vaccMN` is the accumulator at row `M`, column `N`.
-    // Initialize accumulators with bias. 4 bias values are loaded from the
-    // weight matrix, at the start of the group of 4 columns.
-    int32_t bias0 = ((const int32_t*)w)[0];
-    int32_t vacc00 = bias0;
-    int32_t vacc10 = bias0;
-    int32_t vacc20 = bias0;
-    int32_t vacc30 = bias0;
-    int32_t vacc40 = bias0;
-    int32_t vacc50 = bias0;
-    int32_t vacc60 = bias0;
-    int32_t vacc70 = bias0;
-    int32_t vacc80 = bias0;
-    int32_t vacc90 = bias0;
-    int32_t vacc100 = bias0;
-    int32_t vacc110 = bias0;
-    int32_t bias1 = ((const int32_t*)w)[1];
-    int32_t vacc01 = bias1;
-    int32_t vacc11 = bias1;
-    int32_t vacc21 = bias1;
-    int32_t vacc31 = bias1;
-    int32_t vacc41 = bias1;
-    int32_t vacc51 = bias1;
-    int32_t vacc61 = bias1;
-    int32_t vacc71 = bias1;
-    int32_t vacc81 = bias1;
-    int32_t vacc91 = bias1;
-    int32_t vacc101 = bias1;
-    int32_t vacc111 = bias1;
-    int32_t bias2 = ((const int32_t*)w)[2];
-    int32_t vacc02 = bias2;
-    int32_t vacc12 = bias2;
-    int32_t vacc22 = bias2;
-    int32_t vacc32 = bias2;
-    int32_t vacc42 = bias2;
-    int32_t vacc52 = bias2;
-    int32_t vacc62 = bias2;
-    int32_t vacc72 = bias2;
-    int32_t vacc82 = bias2;
-    int32_t vacc92 = bias2;
-    int32_t vacc102 = bias2;
-    int32_t vacc112 = bias2;
-    int32_t bias3 = ((const int32_t*)w)[3];
-    int32_t vacc03 = bias3;
-    int32_t vacc13 = bias3;
-    int32_t vacc23 = bias3;
-    int32_t vacc33 = bias3;
-    int32_t vacc43 = bias3;
-    int32_t vacc53 = bias3;
-    int32_t vacc63 = bias3;
-    int32_t vacc73 = bias3;
-    int32_t vacc83 = bias3;
-    int32_t vacc93 = bias3;
-    int32_t vacc103 = bias3;
-    int32_t vacc113 = bias3;
-
-    w = (const void*)((uintptr_t)w + 4 * sizeof(int32_t));
-
-    // Inner accumulation loop along the 4 columns.
-    // Handle 4 rows at each iteration: this is key to modelling what an
-    // actual kernel using ARMv8.2 dot-product instructions would look like.
-    size_t k = 0;
-    while (k < kc) {
-      // Load a 12x4 block of activations, and compute sums along rows.
-      int16_t vasum0 = 0;
-      int32_t va00 = *a0++;
-      vasum0 += (int16_t) va00;
-      int32_t va01 = *a0++;
-      vasum0 += (int16_t) va01;
-      int32_t va02 = *a0++;
-      vasum0 += (int16_t) va02;
-      int32_t va03 = *a0++;
-      vasum0 += (int16_t) va03;
-      int16_t vasum1 = 0;
-      int32_t va10 = *a1++;
-      vasum1 += (int16_t) va10;
-      int32_t va11 = *a1++;
-      vasum1 += (int16_t) va11;
-      int32_t va12 = *a1++;
-      vasum1 += (int16_t) va12;
-      int32_t va13 = *a1++;
-      vasum1 += (int16_t) va13;
-      int16_t vasum2 = 0;
-      int32_t va20 = *a2++;
-      vasum2 += (int16_t) va20;
-      int32_t va21 = *a2++;
-      vasum2 += (int16_t) va21;
-      int32_t va22 = *a2++;
-      vasum2 += (int16_t) va22;
-      int32_t va23 = *a2++;
-      vasum2 += (int16_t) va23;
-      int16_t vasum3 = 0;
-      int32_t va30 = *a3++;
-      vasum3 += (int16_t) va30;
-      int32_t va31 = *a3++;
-      vasum3 += (int16_t) va31;
-      int32_t va32 = *a3++;
-      vasum3 += (int16_t) va32;
-      int32_t va33 = *a3++;
-      vasum3 += (int16_t) va33;
-      int16_t vasum4 = 0;
-      int32_t va40 = *a4++;
-      vasum4 += (int16_t) va40;
-      int32_t va41 = *a4++;
-      vasum4 += (int16_t) va41;
-      int32_t va42 = *a4++;
-      vasum4 += (int16_t) va42;
-      int32_t va43 = *a4++;
-      vasum4 += (int16_t) va43;
-      int16_t vasum5 = 0;
-      int32_t va50 = *a5++;
-      vasum5 += (int16_t) va50;
-      int32_t va51 = *a5++;
-      vasum5 += (int16_t) va51;
-      int32_t va52 = *a5++;
-      vasum5 += (int16_t) va52;
-      int32_t va53 = *a5++;
-      vasum5 += (int16_t) va53;
-      int16_t vasum6 = 0;
-      int32_t va60 = *a6++;
-      vasum6 += (int16_t) va60;
-      int32_t va61 = *a6++;
-      vasum6 += (int16_t) va61;
-      int32_t va62 = *a6++;
-      vasum6 += (int16_t) va62;
-      int32_t va63 = *a6++;
-      vasum6 += (int16_t) va63;
-      int16_t vasum7 = 0;
-      int32_t va70 = *a7++;
-      vasum7 += (int16_t) va70;
-      int32_t va71 = *a7++;
-      vasum7 += (int16_t) va71;
-      int32_t va72 = *a7++;
-      vasum7 += (int16_t) va72;
-      int32_t va73 = *a7++;
-      vasum7 += (int16_t) va73;
-      int16_t vasum8 = 0;
-      int32_t va80 = *a8++;
-      vasum8 += (int16_t) va80;
-      int32_t va81 = *a8++;
-      vasum8 += (int16_t) va81;
-      int32_t va82 = *a8++;
-      vasum8 += (int16_t) va82;
-      int32_t va83 = *a8++;
-      vasum8 += (int16_t) va83;
-      int16_t vasum9 = 0;
-      int32_t va90 = *a9++;
-      vasum9 += (int16_t) va90;
-      int32_t va91 = *a9++;
-      vasum9 += (int16_t) va91;
-      int32_t va92 = *a9++;
-      vasum9 += (int16_t) va92;
-      int32_t va93 = *a9++;
-      vasum9 += (int16_t) va93;
-      int16_t vasum10 = 0;
-      int32_t va100 = *a10++;
-      vasum10 += (int16_t) va100;
-      int32_t va101 = *a10++;
-      vasum10 += (int16_t) va101;
-      int32_t va102 = *a10++;
-      vasum10 += (int16_t) va102;
-      int32_t va103 = *a10++;
-      vasum10 += (int16_t) va103;
-      int16_t vasum11 = 0;
-      int32_t va110 = *a11++;
-      vasum11 += (int16_t) va110;
-      int32_t va111 = *a11++;
-      vasum11 += (int16_t) va111;
-      int32_t va112 = *a11++;
-      vasum11 += (int16_t) va112;
-      int32_t va113 = *a11++;
-      vasum11 += (int16_t) va113;
-
-      // Load a 4x4 block of weights.
-      int32_t vb00 = (int32_t) ((const uint8_t*)w)[0];
-      int32_t vb10 = (int32_t) ((const uint8_t*)w)[1];
-      int32_t vb20 = (int32_t) ((const uint8_t*)w)[2];
-      int32_t vb30 = (int32_t) ((const uint8_t*)w)[3];
-
-      w = (const void*)((uintptr_t)w + 4 * sizeof(uint8_t));
-      int32_t vb01 = (int32_t) ((const uint8_t*)w)[0];
-      int32_t vb11 = (int32_t) ((const uint8_t*)w)[1];
-      int32_t vb21 = (int32_t) ((const uint8_t*)w)[2];
-      int32_t vb31 = (int32_t) ((const uint8_t*)w)[3];
-
-      w = (const void*)((uintptr_t)w + 4 * sizeof(uint8_t));
-      int32_t vb02 = (int32_t) ((const uint8_t*)w)[0];
-      int32_t vb12 = (int32_t) ((const uint8_t*)w)[1];
-      int32_t vb22 = (int32_t) ((const uint8_t*)w)[2];
-      int32_t vb32 = (int32_t) ((const uint8_t*)w)[3];
-
-      w = (const void*)((uintptr_t)w + 4 * sizeof(uint8_t));
-      int32_t vb03 = (int32_t) ((const uint8_t*)w)[0];
-      int32_t vb13 = (int32_t) ((const uint8_t*)w)[1];
-      int32_t vb23 = (int32_t) ((const uint8_t*)w)[2];
-      int32_t vb33 = (int32_t) ((const uint8_t*)w)[3];
-
-      w = (const void*)((uintptr_t)w + 4 * sizeof(uint8_t));
-
-      // Multiply-accumulate: 12x4 * 4x4 --> 12x4. The inner size 4 here means
-      // we're computing 4D dot-products, which makes this a model for
-      // an ARMv8.2 dot-product kernel.
-      vacc00 += va00 * vb00;
-      vacc00 += va01 * vb10;
-      vacc00 += va02 * vb20;
-      vacc00 += va03 * vb30;
-      vacc00 -= ((int32_t) vasum0) * vb_zero_point;
-      vacc01 += va00 * vb01;
-      vacc01 += va01 * vb11;
-      vacc01 += va02 * vb21;
-      vacc01 += va03 * vb31;
-      vacc01 -= ((int32_t) vasum0) * vb_zero_point;
-      vacc02 += va00 * vb02;
-      vacc02 += va01 * vb12;
-      vacc02 += va02 * vb22;
-      vacc02 += va03 * vb32;
-      vacc02 -= ((int32_t) vasum0) * vb_zero_point;
-      vacc03 += va00 * vb03;
-      vacc03 += va01 * vb13;
-      vacc03 += va02 * vb23;
-      vacc03 += va03 * vb33;
-      vacc03 -= ((int32_t) vasum0) * vb_zero_point;
-      vacc10 += va10 * vb00;
-      vacc10 += va11 * vb10;
-      vacc10 += va12 * vb20;
-      vacc10 += va13 * vb30;
-      vacc10 -= ((int32_t) vasum1) * vb_zero_point;
-      vacc11 += va10 * vb01;
-      vacc11 += va11 * vb11;
-      vacc11 += va12 * vb21;
-      vacc11 += va13 * vb31;
-      vacc11 -= ((int32_t) vasum1) * vb_zero_point;
-      vacc12 += va10 * vb02;
-      vacc12 += va11 * vb12;
-      vacc12 += va12 * vb22;
-      vacc12 += va13 * vb32;
-      vacc12 -= ((int32_t) vasum1) * vb_zero_point;
-      vacc13 += va10 * vb03;
-      vacc13 += va11 * vb13;
-      vacc13 += va12 * vb23;
-      vacc13 += va13 * vb33;
-      vacc13 -= ((int32_t) vasum1) * vb_zero_point;
-      vacc20 += va20 * vb00;
-      vacc20 += va21 * vb10;
-      vacc20 += va22 * vb20;
-      vacc20 += va23 * vb30;
-      vacc20 -= ((int32_t) vasum2) * vb_zero_point;
-      vacc21 += va20 * vb01;
-      vacc21 += va21 * vb11;
-      vacc21 += va22 * vb21;
-      vacc21 += va23 * vb31;
-      vacc21 -= ((int32_t) vasum2) * vb_zero_point;
-      vacc22 += va20 * vb02;
-      vacc22 += va21 * vb12;
-      vacc22 += va22 * vb22;
-      vacc22 += va23 * vb32;
-      vacc22 -= ((int32_t) vasum2) * vb_zero_point;
-      vacc23 += va20 * vb03;
-      vacc23 += va21 * vb13;
-      vacc23 += va22 * vb23;
-      vacc23 += va23 * vb33;
-      vacc23 -= ((int32_t) vasum2) * vb_zero_point;
-      vacc30 += va30 * vb00;
-      vacc30 += va31 * vb10;
-      vacc30 += va32 * vb20;
-      vacc30 += va33 * vb30;
-      vacc30 -= ((int32_t) vasum3) * vb_zero_point;
-      vacc31 += va30 * vb01;
-      vacc31 += va31 * vb11;
-      vacc31 += va32 * vb21;
-      vacc31 += va33 * vb31;
-      vacc31 -= ((int32_t) vasum3) * vb_zero_point;
-      vacc32 += va30 * vb02;
-      vacc32 += va31 * vb12;
-      vacc32 += va32 * vb22;
-      vacc32 += va33 * vb32;
-      vacc32 -= ((int32_t) vasum3) * vb_zero_point;
-      vacc33 += va30 * vb03;
-      vacc33 += va31 * vb13;
-      vacc33 += va32 * vb23;
-      vacc33 += va33 * vb33;
-      vacc33 -= ((int32_t) vasum3) * vb_zero_point;
-      vacc40 += va40 * vb00;
-      vacc40 += va41 * vb10;
-      vacc40 += va42 * vb20;
-      vacc40 += va43 * vb30;
-      vacc40 -= ((int32_t) vasum4) * vb_zero_point;
-      vacc41 += va40 * vb01;
-      vacc41 += va41 * vb11;
-      vacc41 += va42 * vb21;
-      vacc41 += va43 * vb31;
-      vacc41 -= ((int32_t) vasum4) * vb_zero_point;
-      vacc42 += va40 * vb02;
-      vacc42 += va41 * vb12;
-      vacc42 += va42 * vb22;
-      vacc42 += va43 * vb32;
-      vacc42 -= ((int32_t) vasum4) * vb_zero_point;
-      vacc43 += va40 * vb03;
-      vacc43 += va41 * vb13;
-      vacc43 += va42 * vb23;
-      vacc43 += va43 * vb33;
-      vacc43 -= ((int32_t) vasum4) * vb_zero_point;
-      vacc50 += va50 * vb00;
-      vacc50 += va51 * vb10;
-      vacc50 += va52 * vb20;
-      vacc50 += va53 * vb30;
-      vacc50 -= ((int32_t) vasum5) * vb_zero_point;
-      vacc51 += va50 * vb01;
-      vacc51 += va51 * vb11;
-      vacc51 += va52 * vb21;
-      vacc51 += va53 * vb31;
-      vacc51 -= ((int32_t) vasum5) * vb_zero_point;
-      vacc52 += va50 * vb02;
-      vacc52 += va51 * vb12;
-      vacc52 += va52 * vb22;
-      vacc52 += va53 * vb32;
-      vacc52 -= ((int32_t) vasum5) * vb_zero_point;
-      vacc53 += va50 * vb03;
-      vacc53 += va51 * vb13;
-      vacc53 += va52 * vb23;
-      vacc53 += va53 * vb33;
-      vacc53 -= ((int32_t) vasum5) * vb_zero_point;
-      vacc60 += va60 * vb00;
-      vacc60 += va61 * vb10;
-      vacc60 += va62 * vb20;
-      vacc60 += va63 * vb30;
-      vacc60 -= ((int32_t) vasum6) * vb_zero_point;
-      vacc61 += va60 * vb01;
-      vacc61 += va61 * vb11;
-      vacc61 += va62 * vb21;
-      vacc61 += va63 * vb31;
-      vacc61 -= ((int32_t) vasum6) * vb_zero_point;
-      vacc62 += va60 * vb02;
-      vacc62 += va61 * vb12;
-      vacc62 += va62 * vb22;
-      vacc62 += va63 * vb32;
-      vacc62 -= ((int32_t) vasum6) * vb_zero_point;
-      vacc63 += va60 * vb03;
-      vacc63 += va61 * vb13;
-      vacc63 += va62 * vb23;
-      vacc63 += va63 * vb33;
-      vacc63 -= ((int32_t) vasum6) * vb_zero_point;
-      vacc70 += va70 * vb00;
-      vacc70 += va71 * vb10;
-      vacc70 += va72 * vb20;
-      vacc70 += va73 * vb30;
-      vacc70 -= ((int32_t) vasum7) * vb_zero_point;
-      vacc71 += va70 * vb01;
-      vacc71 += va71 * vb11;
-      vacc71 += va72 * vb21;
-      vacc71 += va73 * vb31;
-      vacc71 -= ((int32_t) vasum7) * vb_zero_point;
-      vacc72 += va70 * vb02;
-      vacc72 += va71 * vb12;
-      vacc72 += va72 * vb22;
-      vacc72 += va73 * vb32;
-      vacc72 -= ((int32_t) vasum7) * vb_zero_point;
-      vacc73 += va70 * vb03;
-      vacc73 += va71 * vb13;
-      vacc73 += va72 * vb23;
-      vacc73 += va73 * vb33;
-      vacc73 -= ((int32_t) vasum7) * vb_zero_point;
-      vacc80 += va80 * vb00;
-      vacc80 += va81 * vb10;
-      vacc80 += va82 * vb20;
-      vacc80 += va83 * vb30;
-      vacc80 -= ((int32_t) vasum8) * vb_zero_point;
-      vacc81 += va80 * vb01;
-      vacc81 += va81 * vb11;
-      vacc81 += va82 * vb21;
-      vacc81 += va83 * vb31;
-      vacc81 -= ((int32_t) vasum8) * vb_zero_point;
-      vacc82 += va80 * vb02;
-      vacc82 += va81 * vb12;
-      vacc82 += va82 * vb22;
-      vacc82 += va83 * vb32;
-      vacc82 -= ((int32_t) vasum8) * vb_zero_point;
-      vacc83 += va80 * vb03;
-      vacc83 += va81 * vb13;
-      vacc83 += va82 * vb23;
-      vacc83 += va83 * vb33;
-      vacc83 -= ((int32_t) vasum8) * vb_zero_point;
-      vacc90 += va90 * vb00;
-      vacc90 += va91 * vb10;
-      vacc90 += va92 * vb20;
-      vacc90 += va93 * vb30;
-      vacc90 -= ((int32_t) vasum9) * vb_zero_point;
-      vacc91 += va90 * vb01;
-      vacc91 += va91 * vb11;
-      vacc91 += va92 * vb21;
-      vacc91 += va93 * vb31;
-      vacc91 -= ((int32_t) vasum9) * vb_zero_point;
-      vacc92 += va90 * vb02;
-      vacc92 += va91 * vb12;
-      vacc92 += va92 * vb22;
-      vacc92 += va93 * vb32;
-      vacc92 -= ((int32_t) vasum9) * vb_zero_point;
-      vacc93 += va90 * vb03;
-      vacc93 += va91 * vb13;
-      vacc93 += va92 * vb23;
-      vacc93 += va93 * vb33;
-      vacc93 -= ((int32_t) vasum9) * vb_zero_point;
-      vacc100 += va100 * vb00;
-      vacc100 += va101 * vb10;
-      vacc100 += va102 * vb20;
-      vacc100 += va103 * vb30;
-      vacc100 -= ((int32_t) vasum10) * vb_zero_point;
-      vacc101 += va100 * vb01;
-      vacc101 += va101 * vb11;
-      vacc101 += va102 * vb21;
-      vacc101 += va103 * vb31;
-      vacc101 -= ((int32_t) vasum10) * vb_zero_point;
-      vacc102 += va100 * vb02;
-      vacc102 += va101 * vb12;
-      vacc102 += va102 * vb22;
-      vacc102 += va103 * vb32;
-      vacc102 -= ((int32_t) vasum10) * vb_zero_point;
-      vacc103 += va100 * vb03;
-      vacc103 += va101 * vb13;
-      vacc103 += va102 * vb23;
-      vacc103 += va103 * vb33;
-      vacc103 -= ((int32_t) vasum10) * vb_zero_point;
-      vacc110 += va110 * vb00;
-      vacc110 += va111 * vb10;
-      vacc110 += va112 * vb20;
-      vacc110 += va113 * vb30;
-      vacc110 -= ((int32_t) vasum11) * vb_zero_point;
-      vacc111 += va110 * vb01;
-      vacc111 += va111 * vb11;
-      vacc111 += va112 * vb21;
-      vacc111 += va113 * vb31;
-      vacc111 -= ((int32_t) vasum11) * vb_zero_point;
-      vacc112 += va110 * vb02;
-      vacc112 += va111 * vb12;
-      vacc112 += va112 * vb22;
-      vacc112 += va113 * vb32;
-      vacc112 -= ((int32_t) vasum11) * vb_zero_point;
-      vacc113 += va110 * vb03;
-      vacc113 += va111 * vb13;
-      vacc113 += va112 * vb23;
-      vacc113 += va113 * vb33;
-      vacc113 -= ((int32_t) vasum11) * vb_zero_point;
-
-      k += 4 * sizeof(uint8_t);
-    }
-    // End of accumulation loop. The variable `k` contains the amount by which
-    // we advanced the `va` pointers, so we rewind by this amount now.
-    a0 = (const uint8_t*)((uintptr_t)a0 - k);
-    a1 = (const uint8_t*)((uintptr_t)a1 - k);
-    a2 = (const uint8_t*)((uintptr_t)a2 - k);
-    a3 = (const uint8_t*)((uintptr_t)a3 - k);
-    a4 = (const uint8_t*)((uintptr_t)a4 - k);
-    a5 = (const uint8_t*)((uintptr_t)a5 - k);
-    a6 = (const uint8_t*)((uintptr_t)a6 - k);
-    a7 = (const uint8_t*)((uintptr_t)a7 - k);
-    a8 = (const uint8_t*)((uintptr_t)a8 - k);
-    a9 = (const uint8_t*)((uintptr_t)a9 - k);
-    a10 = (const uint8_t*)((uintptr_t)a10 - k);
-    a11 = (const uint8_t*)((uintptr_t)a11 - k);
-
-    // Post-accumulation work
-
-    const int32_t vmultiplier = params->scalar.multiplier;
-    const int64_t vq31rounding = INT64_C(0x40000000);
-    const int32_t vremainder_mask = params->scalar.remainder_mask;
-    const uint32_t vshift = params->scalar.shift;
-    const int32_t vremainder_threshold = params->scalar.remainder_threshold;
-    const int32_t voutput_min = params->scalar.output_min_less_zero_point;
-    const int32_t voutput_max = params->scalar.output_max_less_zero_point;
-    const int32_t voutput_zero_point = params->scalar.output_zero_point;
-
-    const int64_t vproduct00 = (int64_t)vacc00 * (int64_t)vmultiplier;
-    const int64_t vproduct01 = (int64_t)vacc01 * (int64_t)vmultiplier;
-    const int64_t vproduct02 = (int64_t)vacc02 * (int64_t)vmultiplier;
-    const int64_t vproduct03 = (int64_t)vacc03 * (int64_t)vmultiplier;
-    const int64_t vproduct10 = (int64_t)vacc10 * (int64_t)vmultiplier;
-    const int64_t vproduct11 = (int64_t)vacc11 * (int64_t)vmultiplier;
-    const int64_t vproduct12 = (int64_t)vacc12 * (int64_t)vmultiplier;
-    const int64_t vproduct13 = (int64_t)vacc13 * (int64_t)vmultiplier;
-    const int64_t vproduct20 = (int64_t)vacc20 * (int64_t)vmultiplier;
-    const int64_t vproduct21 = (int64_t)vacc21 * (int64_t)vmultiplier;
-    const int64_t vproduct22 = (int64_t)vacc22 * (int64_t)vmultiplier;
-    const int64_t vproduct23 = (int64_t)vacc23 * (int64_t)vmultiplier;
-    const int64_t vproduct30 = (int64_t)vacc30 * (int64_t)vmultiplier;
-    const int64_t vproduct31 = (int64_t)vacc31 * (int64_t)vmultiplier;
-    const int64_t vproduct32 = (int64_t)vacc32 * (int64_t)vmultiplier;
-    const int64_t vproduct33 = (int64_t)vacc33 * (int64_t)vmultiplier;
-    const int64_t vproduct40 = (int64_t)vacc40 * (int64_t)vmultiplier;
-    const int64_t vproduct41 = (int64_t)vacc41 * (int64_t)vmultiplier;
-    const int64_t vproduct42 = (int64_t)vacc42 * (int64_t)vmultiplier;
-    const int64_t vproduct43 = (int64_t)vacc43 * (int64_t)vmultiplier;
-    const int64_t vproduct50 = (int64_t)vacc50 * (int64_t)vmultiplier;
-    const int64_t vproduct51 = (int64_t)vacc51 * (int64_t)vmultiplier;
-    const int64_t vproduct52 = (int64_t)vacc52 * (int64_t)vmultiplier;
-    const int64_t vproduct53 = (int64_t)vacc53 * (int64_t)vmultiplier;
-    const int64_t vproduct60 = (int64_t)vacc60 * (int64_t)vmultiplier;
-    const int64_t vproduct61 = (int64_t)vacc61 * (int64_t)vmultiplier;
-    const int64_t vproduct62 = (int64_t)vacc62 * (int64_t)vmultiplier;
-    const int64_t vproduct63 = (int64_t)vacc63 * (int64_t)vmultiplier;
-    const int64_t vproduct70 = (int64_t)vacc70 * (int64_t)vmultiplier;
-    const int64_t vproduct71 = (int64_t)vacc71 * (int64_t)vmultiplier;
-    const int64_t vproduct72 = (int64_t)vacc72 * (int64_t)vmultiplier;
-    const int64_t vproduct73 = (int64_t)vacc73 * (int64_t)vmultiplier;
-    const int64_t vproduct80 = (int64_t)vacc80 * (int64_t)vmultiplier;
-    const int64_t vproduct81 = (int64_t)vacc81 * (int64_t)vmultiplier;
-    const int64_t vproduct82 = (int64_t)vacc82 * (int64_t)vmultiplier;
-    const int64_t vproduct83 = (int64_t)vacc83 * (int64_t)vmultiplier;
-    const int64_t vproduct90 = (int64_t)vacc90 * (int64_t)vmultiplier;
-    const int64_t vproduct91 = (int64_t)vacc91 * (int64_t)vmultiplier;
-    const int64_t vproduct92 = (int64_t)vacc92 * (int64_t)vmultiplier;
-    const int64_t vproduct93 = (int64_t)vacc93 * (int64_t)vmultiplier;
-    const int64_t vproduct100 = (int64_t)vacc100 * (int64_t)vmultiplier;
-    const int64_t vproduct101 = (int64_t)vacc101 * (int64_t)vmultiplier;
-    const int64_t vproduct102 = (int64_t)vacc102 * (int64_t)vmultiplier;
-    const int64_t vproduct103 = (int64_t)vacc103 * (int64_t)vmultiplier;
-    const int64_t vproduct110 = (int64_t)vacc110 * (int64_t)vmultiplier;
-    const int64_t vproduct111 = (int64_t)vacc111 * (int64_t)vmultiplier;
-    const int64_t vproduct112 = (int64_t)vacc112 * (int64_t)vmultiplier;
-    const int64_t vproduct113 = (int64_t)vacc113 * (int64_t)vmultiplier;
-
-    const int32_t vq31product00 = (int32_t)(uint32_t)((uint64_t)(vproduct00 + vq31rounding) >> 31);
-    const int32_t vq31product01 = (int32_t)(uint32_t)((uint64_t)(vproduct01 + vq31rounding) >> 31);
-    const int32_t vq31product02 = (int32_t)(uint32_t)((uint64_t)(vproduct02 + vq31rounding) >> 31);
-    const int32_t vq31product03 = (int32_t)(uint32_t)((uint64_t)(vproduct03 + vq31rounding) >> 31);
-    const int32_t vq31product10 = (int32_t)(uint32_t)((uint64_t)(vproduct10 + vq31rounding) >> 31);
-    const int32_t vq31product11 = (int32_t)(uint32_t)((uint64_t)(vproduct11 + vq31rounding) >> 31);
-    const int32_t vq31product12 = (int32_t)(uint32_t)((uint64_t)(vproduct12 + vq31rounding) >> 31);
-    const int32_t vq31product13 = (int32_t)(uint32_t)((uint64_t)(vproduct13 + vq31rounding) >> 31);
-    const int32_t vq31product20 = (int32_t)(uint32_t)((uint64_t)(vproduct20 + vq31rounding) >> 31);
-    const int32_t vq31product21 = (int32_t)(uint32_t)((uint64_t)(vproduct21 + vq31rounding) >> 31);
-    const int32_t vq31product22 = (int32_t)(uint32_t)((uint64_t)(vproduct22 + vq31rounding) >> 31);
-    const int32_t vq31product23 = (int32_t)(uint32_t)((uint64_t)(vproduct23 + vq31rounding) >> 31);
-    const int32_t vq31product30 = (int32_t)(uint32_t)((uint64_t)(vproduct30 + vq31rounding) >> 31);
-    const int32_t vq31product31 = (int32_t)(uint32_t)((uint64_t)(vproduct31 + vq31rounding) >> 31);
-    const int32_t vq31product32 = (int32_t)(uint32_t)((uint64_t)(vproduct32 + vq31rounding) >> 31);
-    const int32_t vq31product33 = (int32_t)(uint32_t)((uint64_t)(vproduct33 + vq31rounding) >> 31);
-    const int32_t vq31product40 = (int32_t)(uint32_t)((uint64_t)(vproduct40 + vq31rounding) >> 31);
-    const int32_t vq31product41 = (int32_t)(uint32_t)((uint64_t)(vproduct41 + vq31rounding) >> 31);
-    const int32_t vq31product42 = (int32_t)(uint32_t)((uint64_t)(vproduct42 + vq31rounding) >> 31);
-    const int32_t vq31product43 = (int32_t)(uint32_t)((uint64_t)(vproduct43 + vq31rounding) >> 31);
-    const int32_t vq31product50 = (int32_t)(uint32_t)((uint64_t)(vproduct50 + vq31rounding) >> 31);
-    const int32_t vq31product51 = (int32_t)(uint32_t)((uint64_t)(vproduct51 + vq31rounding) >> 31);
-    const int32_t vq31product52 = (int32_t)(uint32_t)((uint64_t)(vproduct52 + vq31rounding) >> 31);
-    const int32_t vq31product53 = (int32_t)(uint32_t)((uint64_t)(vproduct53 + vq31rounding) >> 31);
-    const int32_t vq31product60 = (int32_t)(uint32_t)((uint64_t)(vproduct60 + vq31rounding) >> 31);
-    const int32_t vq31product61 = (int32_t)(uint32_t)((uint64_t)(vproduct61 + vq31rounding) >> 31);
-    const int32_t vq31product62 = (int32_t)(uint32_t)((uint64_t)(vproduct62 + vq31rounding) >> 31);
-    const int32_t vq31product63 = (int32_t)(uint32_t)((uint64_t)(vproduct63 + vq31rounding) >> 31);
-    const int32_t vq31product70 = (int32_t)(uint32_t)((uint64_t)(vproduct70 + vq31rounding) >> 31);
-    const int32_t vq31product71 = (int32_t)(uint32_t)((uint64_t)(vproduct71 + vq31rounding) >> 31);
-    const int32_t vq31product72 = (int32_t)(uint32_t)((uint64_t)(vproduct72 + vq31rounding) >> 31);
-    const int32_t vq31product73 = (int32_t)(uint32_t)((uint64_t)(vproduct73 + vq31rounding) >> 31);
-    const int32_t vq31product80 = (int32_t)(uint32_t)((uint64_t)(vproduct80 + vq31rounding) >> 31);
-    const int32_t vq31product81 = (int32_t)(uint32_t)((uint64_t)(vproduct81 + vq31rounding) >> 31);
-    const int32_t vq31product82 = (int32_t)(uint32_t)((uint64_t)(vproduct82 + vq31rounding) >> 31);
-    const int32_t vq31product83 = (int32_t)(uint32_t)((uint64_t)(vproduct83 + vq31rounding) >> 31);
-    const int32_t vq31product90 = (int32_t)(uint32_t)((uint64_t)(vproduct90 + vq31rounding) >> 31);
-    const int32_t vq31product91 = (int32_t)(uint32_t)((uint64_t)(vproduct91 + vq31rounding) >> 31);
-    const int32_t vq31product92 = (int32_t)(uint32_t)((uint64_t)(vproduct92 + vq31rounding) >> 31);
-    const int32_t vq31product93 = (int32_t)(uint32_t)((uint64_t)(vproduct93 + vq31rounding) >> 31);
-    const int32_t vq31product100 = (int32_t)(uint32_t)((uint64_t)(vproduct100 + vq31rounding) >> 31);
-    const int32_t vq31product101 = (int32_t)(uint32_t)((uint64_t)(vproduct101 + vq31rounding) >> 31);
-    const int32_t vq31product102 = (int32_t)(uint32_t)((uint64_t)(vproduct102 + vq31rounding) >> 31);
-    const int32_t vq31product103 = (int32_t)(uint32_t)((uint64_t)(vproduct103 + vq31rounding) >> 31);
-    const int32_t vq31product110 = (int32_t)(uint32_t)((uint64_t)(vproduct110 + vq31rounding) >> 31);
-    const int32_t vq31product111 = (int32_t)(uint32_t)((uint64_t)(vproduct111 + vq31rounding) >> 31);
-    const int32_t vq31product112 = (int32_t)(uint32_t)((uint64_t)(vproduct112 + vq31rounding) >> 31);
-    const int32_t vq31product113 = (int32_t)(uint32_t)((uint64_t)(vproduct113 + vq31rounding) >> 31);
-
-    const int32_t vremainder00 = (vq31product00 & vremainder_mask) - (int32_t)(vq31product00 < 0);
-    const int32_t vremainder01 = (vq31product01 & vremainder_mask) - (int32_t)(vq31product01 < 0);
-    const int32_t vremainder02 = (vq31product02 & vremainder_mask) - (int32_t)(vq31product02 < 0);
-    const int32_t vremainder03 = (vq31product03 & vremainder_mask) - (int32_t)(vq31product03 < 0);
-    const int32_t vremainder10 = (vq31product10 & vremainder_mask) - (int32_t)(vq31product10 < 0);
-    const int32_t vremainder11 = (vq31product11 & vremainder_mask) - (int32_t)(vq31product11 < 0);
-    const int32_t vremainder12 = (vq31product12 & vremainder_mask) - (int32_t)(vq31product12 < 0);
-    const int32_t vremainder13 = (vq31product13 & vremainder_mask) - (int32_t)(vq31product13 < 0);
-    const int32_t vremainder20 = (vq31product20 & vremainder_mask) - (int32_t)(vq31product20 < 0);
-    const int32_t vremainder21 = (vq31product21 & vremainder_mask) - (int32_t)(vq31product21 < 0);
-    const int32_t vremainder22 = (vq31product22 & vremainder_mask) - (int32_t)(vq31product22 < 0);
-    const int32_t vremainder23 = (vq31product23 & vremainder_mask) - (int32_t)(vq31product23 < 0);
-    const int32_t vremainder30 = (vq31product30 & vremainder_mask) - (int32_t)(vq31product30 < 0);
-    const int32_t vremainder31 = (vq31product31 & vremainder_mask) - (int32_t)(vq31product31 < 0);
-    const int32_t vremainder32 = (vq31product32 & vremainder_mask) - (int32_t)(vq31product32 < 0);
-    const int32_t vremainder33 = (vq31product33 & vremainder_mask) - (int32_t)(vq31product33 < 0);
-    const int32_t vremainder40 = (vq31product40 & vremainder_mask) - (int32_t)(vq31product40 < 0);
-    const int32_t vremainder41 = (vq31product41 & vremainder_mask) - (int32_t)(vq31product41 < 0);
-    const int32_t vremainder42 = (vq31product42 & vremainder_mask) - (int32_t)(vq31product42 < 0);
-    const int32_t vremainder43 = (vq31product43 & vremainder_mask) - (int32_t)(vq31product43 < 0);
-    const int32_t vremainder50 = (vq31product50 & vremainder_mask) - (int32_t)(vq31product50 < 0);
-    const int32_t vremainder51 = (vq31product51 & vremainder_mask) - (int32_t)(vq31product51 < 0);
-    const int32_t vremainder52 = (vq31product52 & vremainder_mask) - (int32_t)(vq31product52 < 0);
-    const int32_t vremainder53 = (vq31product53 & vremainder_mask) - (int32_t)(vq31product53 < 0);
-    const int32_t vremainder60 = (vq31product60 & vremainder_mask) - (int32_t)(vq31product60 < 0);
-    const int32_t vremainder61 = (vq31product61 & vremainder_mask) - (int32_t)(vq31product61 < 0);
-    const int32_t vremainder62 = (vq31product62 & vremainder_mask) - (int32_t)(vq31product62 < 0);
-    const int32_t vremainder63 = (vq31product63 & vremainder_mask) - (int32_t)(vq31product63 < 0);
-    const int32_t vremainder70 = (vq31product70 & vremainder_mask) - (int32_t)(vq31product70 < 0);
-    const int32_t vremainder71 = (vq31product71 & vremainder_mask) - (int32_t)(vq31product71 < 0);
-    const int32_t vremainder72 = (vq31product72 & vremainder_mask) - (int32_t)(vq31product72 < 0);
-    const int32_t vremainder73 = (vq31product73 & vremainder_mask) - (int32_t)(vq31product73 < 0);
-    const int32_t vremainder80 = (vq31product80 & vremainder_mask) - (int32_t)(vq31product80 < 0);
-    const int32_t vremainder81 = (vq31product81 & vremainder_mask) - (int32_t)(vq31product81 < 0);
-    const int32_t vremainder82 = (vq31product82 & vremainder_mask) - (int32_t)(vq31product82 < 0);
-    const int32_t vremainder83 = (vq31product83 & vremainder_mask) - (int32_t)(vq31product83 < 0);
-    const int32_t vremainder90 = (vq31product90 & vremainder_mask) - (int32_t)(vq31product90 < 0);
-    const int32_t vremainder91 = (vq31product91 & vremainder_mask) - (int32_t)(vq31product91 < 0);
-    const int32_t vremainder92 = (vq31product92 & vremainder_mask) - (int32_t)(vq31product92 < 0);
-    const int32_t vremainder93 = (vq31product93 & vremainder_mask) - (int32_t)(vq31product93 < 0);
-    const int32_t vremainder100 = (vq31product100 & vremainder_mask) - (int32_t)(vq31product100 < 0);
-    const int32_t vremainder101 = (vq31product101 & vremainder_mask) - (int32_t)(vq31product101 < 0);
-    const int32_t vremainder102 = (vq31product102 & vremainder_mask) - (int32_t)(vq31product102 < 0);
-    const int32_t vremainder103 = (vq31product103 & vremainder_mask) - (int32_t)(vq31product103 < 0);
-    const int32_t vremainder110 = (vq31product110 & vremainder_mask) - (int32_t)(vq31product110 < 0);
-    const int32_t vremainder111 = (vq31product111 & vremainder_mask) - (int32_t)(vq31product111 < 0);
-    const int32_t vremainder112 = (vq31product112 & vremainder_mask) - (int32_t)(vq31product112 < 0);
-    const int32_t vremainder113 = (vq31product113 & vremainder_mask) - (int32_t)(vq31product113 < 0);
-
-    int32_t vout00 = asr_s32(vq31product00, vshift) + (int32_t)(vremainder00 > vremainder_threshold);
-    int32_t vout01 = asr_s32(vq31product01, vshift) + (int32_t)(vremainder01 > vremainder_threshold);
-    int32_t vout02 = asr_s32(vq31product02, vshift) + (int32_t)(vremainder02 > vremainder_threshold);
-    int32_t vout03 = asr_s32(vq31product03, vshift) + (int32_t)(vremainder03 > vremainder_threshold);
-    int32_t vout10 = asr_s32(vq31product10, vshift) + (int32_t)(vremainder10 > vremainder_threshold);
-    int32_t vout11 = asr_s32(vq31product11, vshift) + (int32_t)(vremainder11 > vremainder_threshold);
-    int32_t vout12 = asr_s32(vq31product12, vshift) + (int32_t)(vremainder12 > vremainder_threshold);
-    int32_t vout13 = asr_s32(vq31product13, vshift) + (int32_t)(vremainder13 > vremainder_threshold);
-    int32_t vout20 = asr_s32(vq31product20, vshift) + (int32_t)(vremainder20 > vremainder_threshold);
-    int32_t vout21 = asr_s32(vq31product21, vshift) + (int32_t)(vremainder21 > vremainder_threshold);
-    int32_t vout22 = asr_s32(vq31product22, vshift) + (int32_t)(vremainder22 > vremainder_threshold);
-    int32_t vout23 = asr_s32(vq31product23, vshift) + (int32_t)(vremainder23 > vremainder_threshold);
-    int32_t vout30 = asr_s32(vq31product30, vshift) + (int32_t)(vremainder30 > vremainder_threshold);
-    int32_t vout31 = asr_s32(vq31product31, vshift) + (int32_t)(vremainder31 > vremainder_threshold);
-    int32_t vout32 = asr_s32(vq31product32, vshift) + (int32_t)(vremainder32 > vremainder_threshold);
-    int32_t vout33 = asr_s32(vq31product33, vshift) + (int32_t)(vremainder33 > vremainder_threshold);
-    int32_t vout40 = asr_s32(vq31product40, vshift) + (int32_t)(vremainder40 > vremainder_threshold);
-    int32_t vout41 = asr_s32(vq31product41, vshift) + (int32_t)(vremainder41 > vremainder_threshold);
-    int32_t vout42 = asr_s32(vq31product42, vshift) + (int32_t)(vremainder42 > vremainder_threshold);
-    int32_t vout43 = asr_s32(vq31product43, vshift) + (int32_t)(vremainder43 > vremainder_threshold);
-    int32_t vout50 = asr_s32(vq31product50, vshift) + (int32_t)(vremainder50 > vremainder_threshold);
-    int32_t vout51 = asr_s32(vq31product51, vshift) + (int32_t)(vremainder51 > vremainder_threshold);
-    int32_t vout52 = asr_s32(vq31product52, vshift) + (int32_t)(vremainder52 > vremainder_threshold);
-    int32_t vout53 = asr_s32(vq31product53, vshift) + (int32_t)(vremainder53 > vremainder_threshold);
-    int32_t vout60 = asr_s32(vq31product60, vshift) + (int32_t)(vremainder60 > vremainder_threshold);
-    int32_t vout61 = asr_s32(vq31product61, vshift) + (int32_t)(vremainder61 > vremainder_threshold);
-    int32_t vout62 = asr_s32(vq31product62, vshift) + (int32_t)(vremainder62 > vremainder_threshold);
-    int32_t vout63 = asr_s32(vq31product63, vshift) + (int32_t)(vremainder63 > vremainder_threshold);
-    int32_t vout70 = asr_s32(vq31product70, vshift) + (int32_t)(vremainder70 > vremainder_threshold);
-    int32_t vout71 = asr_s32(vq31product71, vshift) + (int32_t)(vremainder71 > vremainder_threshold);
-    int32_t vout72 = asr_s32(vq31product72, vshift) + (int32_t)(vremainder72 > vremainder_threshold);
-    int32_t vout73 = asr_s32(vq31product73, vshift) + (int32_t)(vremainder73 > vremainder_threshold);
-    int32_t vout80 = asr_s32(vq31product80, vshift) + (int32_t)(vremainder80 > vremainder_threshold);
-    int32_t vout81 = asr_s32(vq31product81, vshift) + (int32_t)(vremainder81 > vremainder_threshold);
-    int32_t vout82 = asr_s32(vq31product82, vshift) + (int32_t)(vremainder82 > vremainder_threshold);
-    int32_t vout83 = asr_s32(vq31product83, vshift) + (int32_t)(vremainder83 > vremainder_threshold);
-    int32_t vout90 = asr_s32(vq31product90, vshift) + (int32_t)(vremainder90 > vremainder_threshold);
-    int32_t vout91 = asr_s32(vq31product91, vshift) + (int32_t)(vremainder91 > vremainder_threshold);
-    int32_t vout92 = asr_s32(vq31product92, vshift) + (int32_t)(vremainder92 > vremainder_threshold);
-    int32_t vout93 = asr_s32(vq31product93, vshift) + (int32_t)(vremainder93 > vremainder_threshold);
-    int32_t vout100 = asr_s32(vq31product100, vshift) + (int32_t)(vremainder100 > vremainder_threshold);
-    int32_t vout101 = asr_s32(vq31product101, vshift) + (int32_t)(vremainder101 > vremainder_threshold);
-    int32_t vout102 = asr_s32(vq31product102, vshift) + (int32_t)(vremainder102 > vremainder_threshold);
-    int32_t vout103 = asr_s32(vq31product103, vshift) + (int32_t)(vremainder103 > vremainder_threshold);
-    int32_t vout110 = asr_s32(vq31product110, vshift) + (int32_t)(vremainder110 > vremainder_threshold);
-    int32_t vout111 = asr_s32(vq31product111, vshift) + (int32_t)(vremainder111 > vremainder_threshold);
-    int32_t vout112 = asr_s32(vq31product112, vshift) + (int32_t)(vremainder112 > vremainder_threshold);
-    int32_t vout113 = asr_s32(vq31product113, vshift) + (int32_t)(vremainder113 > vremainder_threshold);
-
-    vout00 = vout00 < voutput_min ? voutput_min : vout00;
-    vout01 = vout01 < voutput_min ? voutput_min : vout01;
-    vout02 = vout02 < voutput_min ? voutput_min : vout02;
-    vout03 = vout03 < voutput_min ? voutput_min : vout03;
-    vout10 = vout10 < voutput_min ? voutput_min : vout10;
-    vout11 = vout11 < voutput_min ? voutput_min : vout11;
-    vout12 = vout12 < voutput_min ? voutput_min : vout12;
-    vout13 = vout13 < voutput_min ? voutput_min : vout13;
-    vout20 = vout20 < voutput_min ? voutput_min : vout20;
-    vout21 = vout21 < voutput_min ? voutput_min : vout21;
-    vout22 = vout22 < voutput_min ? voutput_min : vout22;
-    vout23 = vout23 < voutput_min ? voutput_min : vout23;
-    vout30 = vout30 < voutput_min ? voutput_min : vout30;
-    vout31 = vout31 < voutput_min ? voutput_min : vout31;
-    vout32 = vout32 < voutput_min ? voutput_min : vout32;
-    vout33 = vout33 < voutput_min ? voutput_min : vout33;
-    vout40 = vout40 < voutput_min ? voutput_min : vout40;
-    vout41 = vout41 < voutput_min ? voutput_min : vout41;
-    vout42 = vout42 < voutput_min ? voutput_min : vout42;
-    vout43 = vout43 < voutput_min ? voutput_min : vout43;
-    vout50 = vout50 < voutput_min ? voutput_min : vout50;
-    vout51 = vout51 < voutput_min ? voutput_min : vout51;
-    vout52 = vout52 < voutput_min ? voutput_min : vout52;
-    vout53 = vout53 < voutput_min ? voutput_min : vout53;
-    vout60 = vout60 < voutput_min ? voutput_min : vout60;
-    vout61 = vout61 < voutput_min ? voutput_min : vout61;
-    vout62 = vout62 < voutput_min ? voutput_min : vout62;
-    vout63 = vout63 < voutput_min ? voutput_min : vout63;
-    vout70 = vout70 < voutput_min ? voutput_min : vout70;
-    vout71 = vout71 < voutput_min ? voutput_min : vout71;
-    vout72 = vout72 < voutput_min ? voutput_min : vout72;
-    vout73 = vout73 < voutput_min ? voutput_min : vout73;
-    vout80 = vout80 < voutput_min ? voutput_min : vout80;
-    vout81 = vout81 < voutput_min ? voutput_min : vout81;
-    vout82 = vout82 < voutput_min ? voutput_min : vout82;
-    vout83 = vout83 < voutput_min ? voutput_min : vout83;
-    vout90 = vout90 < voutput_min ? voutput_min : vout90;
-    vout91 = vout91 < voutput_min ? voutput_min : vout91;
-    vout92 = vout92 < voutput_min ? voutput_min : vout92;
-    vout93 = vout93 < voutput_min ? voutput_min : vout93;
-    vout100 = vout100 < voutput_min ? voutput_min : vout100;
-    vout101 = vout101 < voutput_min ? voutput_min : vout101;
-    vout102 = vout102 < voutput_min ? voutput_min : vout102;
-    vout103 = vout103 < voutput_min ? voutput_min : vout103;
-    vout110 = vout110 < voutput_min ? voutput_min : vout110;
-    vout111 = vout111 < voutput_min ? voutput_min : vout111;
-    vout112 = vout112 < voutput_min ? voutput_min : vout112;
-    vout113 = vout113 < voutput_min ? voutput_min : vout113;
-
-    vout00 = vout00 > voutput_max ? voutput_max : vout00;
-    vout01 = vout01 > voutput_max ? voutput_max : vout01;
-    vout02 = vout02 > voutput_max ? voutput_max : vout02;
-    vout03 = vout03 > voutput_max ? voutput_max : vout03;
-    vout10 = vout10 > voutput_max ? voutput_max : vout10;
-    vout11 = vout11 > voutput_max ? voutput_max : vout11;
-    vout12 = vout12 > voutput_max ? voutput_max : vout12;
-    vout13 = vout13 > voutput_max ? voutput_max : vout13;
-    vout20 = vout20 > voutput_max ? voutput_max : vout20;
-    vout21 = vout21 > voutput_max ? voutput_max : vout21;
-    vout22 = vout22 > voutput_max ? voutput_max : vout22;
-    vout23 = vout23 > voutput_max ? voutput_max : vout23;
-    vout30 = vout30 > voutput_max ? voutput_max : vout30;
-    vout31 = vout31 > voutput_max ? voutput_max : vout31;
-    vout32 = vout32 > voutput_max ? voutput_max : vout32;
-    vout33 = vout33 > voutput_max ? voutput_max : vout33;
-    vout40 = vout40 > voutput_max ? voutput_max : vout40;
-    vout41 = vout41 > voutput_max ? voutput_max : vout41;
-    vout42 = vout42 > voutput_max ? voutput_max : vout42;
-    vout43 = vout43 > voutput_max ? voutput_max : vout43;
-    vout50 = vout50 > voutput_max ? voutput_max : vout50;
-    vout51 = vout51 > voutput_max ? voutput_max : vout51;
-    vout52 = vout52 > voutput_max ? voutput_max : vout52;
-    vout53 = vout53 > voutput_max ? voutput_max : vout53;
-    vout60 = vout60 > voutput_max ? voutput_max : vout60;
-    vout61 = vout61 > voutput_max ? voutput_max : vout61;
-    vout62 = vout62 > voutput_max ? voutput_max : vout62;
-    vout63 = vout63 > voutput_max ? voutput_max : vout63;
-    vout70 = vout70 > voutput_max ? voutput_max : vout70;
-    vout71 = vout71 > voutput_max ? voutput_max : vout71;
-    vout72 = vout72 > voutput_max ? voutput_max : vout72;
-    vout73 = vout73 > voutput_max ? voutput_max : vout73;
-    vout80 = vout80 > voutput_max ? voutput_max : vout80;
-    vout81 = vout81 > voutput_max ? voutput_max : vout81;
-    vout82 = vout82 > voutput_max ? voutput_max : vout82;
-    vout83 = vout83 > voutput_max ? voutput_max : vout83;
-    vout90 = vout90 > voutput_max ? voutput_max : vout90;
-    vout91 = vout91 > voutput_max ? voutput_max : vout91;
-    vout92 = vout92 > voutput_max ? voutput_max : vout92;
-    vout93 = vout93 > voutput_max ? voutput_max : vout93;
-    vout100 = vout100 > voutput_max ? voutput_max : vout100;
-    vout101 = vout101 > voutput_max ? voutput_max : vout101;
-    vout102 = vout102 > voutput_max ? voutput_max : vout102;
-    vout103 = vout103 > voutput_max ? voutput_max : vout103;
-    vout110 = vout110 > voutput_max ? voutput_max : vout110;
-    vout111 = vout111 > voutput_max ? voutput_max : vout111;
-    vout112 = vout112 > voutput_max ? voutput_max : vout112;
-    vout113 = vout113 > voutput_max ? voutput_max : vout113;
-
-    vout00 += voutput_zero_point;
-    vout01 += voutput_zero_point;
-    vout02 += voutput_zero_point;
-    vout03 += voutput_zero_point;
-    vout10 += voutput_zero_point;
-    vout11 += voutput_zero_point;
-    vout12 += voutput_zero_point;
-    vout13 += voutput_zero_point;
-    vout20 += voutput_zero_point;
-    vout21 += voutput_zero_point;
-    vout22 += voutput_zero_point;
-    vout23 += voutput_zero_point;
-    vout30 += voutput_zero_point;
-    vout31 += voutput_zero_point;
-    vout32 += voutput_zero_point;
-    vout33 += voutput_zero_point;
-    vout40 += voutput_zero_point;
-    vout41 += voutput_zero_point;
-    vout42 += voutput_zero_point;
-    vout43 += voutput_zero_point;
-    vout50 += voutput_zero_point;
-    vout51 += voutput_zero_point;
-    vout52 += voutput_zero_point;
-    vout53 += voutput_zero_point;
-    vout60 += voutput_zero_point;
-    vout61 += voutput_zero_point;
-    vout62 += voutput_zero_point;
-    vout63 += voutput_zero_point;
-    vout70 += voutput_zero_point;
-    vout71 += voutput_zero_point;
-    vout72 += voutput_zero_point;
-    vout73 += voutput_zero_point;
-    vout80 += voutput_zero_point;
-    vout81 += voutput_zero_point;
-    vout82 += voutput_zero_point;
-    vout83 += voutput_zero_point;
-    vout90 += voutput_zero_point;
-    vout91 += voutput_zero_point;
-    vout92 += voutput_zero_point;
-    vout93 += voutput_zero_point;
-    vout100 += voutput_zero_point;
-    vout101 += voutput_zero_point;
-    vout102 += voutput_zero_point;
-    vout103 += voutput_zero_point;
-    vout110 += voutput_zero_point;
-    vout111 += voutput_zero_point;
-    vout112 += voutput_zero_point;
-    vout113 += voutput_zero_point;
-
-    if XNN_LIKELY (nc >= 4) {
-      // Main case where the 4 columns fit in the destination.
-      c0[0] = vout00;
-      c0[1] = vout01;
-      c0[2] = vout02;
-      c0[3] = vout03;
-      c1[0] = vout10;
-      c1[1] = vout11;
-      c1[2] = vout12;
-      c1[3] = vout13;
-      c2[0] = vout20;
-      c2[1] = vout21;
-      c2[2] = vout22;
-      c2[3] = vout23;
-      c3[0] = vout30;
-      c3[1] = vout31;
-      c3[2] = vout32;
-      c3[3] = vout33;
-      c4[0] = vout40;
-      c4[1] = vout41;
-      c4[2] = vout42;
-      c4[3] = vout43;
-      c5[0] = vout50;
-      c5[1] = vout51;
-      c5[2] = vout52;
-      c5[3] = vout53;
-      c6[0] = vout60;
-      c6[1] = vout61;
-      c6[2] = vout62;
-      c6[3] = vout63;
-      c7[0] = vout70;
-      c7[1] = vout71;
-      c7[2] = vout72;
-      c7[3] = vout73;
-      c8[0] = vout80;
-      c8[1] = vout81;
-      c8[2] = vout82;
-      c8[3] = vout83;
-      c9[0] = vout90;
-      c9[1] = vout91;
-      c9[2] = vout92;
-      c9[3] = vout93;
-      c10[0] = vout100;
-      c10[1] = vout101;
-      c10[2] = vout102;
-      c10[3] = vout103;
-      c11[0] = vout110;
-      c11[1] = vout111;
-      c11[2] = vout112;
-      c11[3] = vout113;
-
-      // Advance to the next 4 columns.
-      c0 = (uint8_t*)((uintptr_t)c0 + cn_stride);
-      c1 = (uint8_t*)((uintptr_t)c1 + cn_stride);
-      c2 = (uint8_t*)((uintptr_t)c2 + cn_stride);
-      c3 = (uint8_t*)((uintptr_t)c3 + cn_stride);
-      c4 = (uint8_t*)((uintptr_t)c4 + cn_stride);
-      c5 = (uint8_t*)((uintptr_t)c5 + cn_stride);
-      c6 = (uint8_t*)((uintptr_t)c6 + cn_stride);
-      c7 = (uint8_t*)((uintptr_t)c7 + cn_stride);
-      c8 = (uint8_t*)((uintptr_t)c8 + cn_stride);
-      c9 = (uint8_t*)((uintptr_t)c9 + cn_stride);
-      c10 = (uint8_t*)((uintptr_t)c10 + cn_stride);
-      c11 = (uint8_t*)((uintptr_t)c11 + cn_stride);
-
-      nc -= 4;
-    } else {
-      // Final case where not all of the 4 columns fit in the destination.
-      if (nc > 0) {
-        c0[0] = vout00;
-        c1[0] = vout10;
-        c2[0] = vout20;
-        c3[0] = vout30;
-        c4[0] = vout40;
-        c5[0] = vout50;
-        c6[0] = vout60;
-        c7[0] = vout70;
-        c8[0] = vout80;
-        c9[0] = vout90;
-        c10[0] = vout100;
-        c11[0] = vout110;
-      }
-      if (nc > 1) {
-        c0[1] = vout01;
-        c1[1] = vout11;
-        c2[1] = vout21;
-        c3[1] = vout31;
-        c4[1] = vout41;
-        c5[1] = vout51;
-        c6[1] = vout61;
-        c7[1] = vout71;
-        c8[1] = vout81;
-        c9[1] = vout91;
-        c10[1] = vout101;
-        c11[1] = vout111;
-      }
-      if (nc > 2) {
-        c0[2] = vout02;
-        c1[2] = vout12;
-        c2[2] = vout22;
-        c3[2] = vout32;
-        c4[2] = vout42;
-        c5[2] = vout52;
-        c6[2] = vout62;
-        c7[2] = vout72;
-        c8[2] = vout82;
-        c9[2] = vout92;
-        c10[2] = vout102;
-        c11[2] = vout112;
-      }
-      if (nc > 3) {
-        c0[3] = vout03;
-        c1[3] = vout13;
-        c2[3] = vout23;
-        c3[3] = vout33;
-        c4[3] = vout43;
-        c5[3] = vout53;
-        c6[3] = vout63;
-        c7[3] = vout73;
-        c8[3] = vout83;
-        c9[3] = vout93;
-        c10[3] = vout103;
-        c11[3] = vout113;
-      }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
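The post-accumulation block deleted above is the standard Q31 requantization applied to every accumulator. A minimal standalone sketch of that per-accumulator math, assuming a signed right shift that is arithmetic on the target compiler (the deleted code used the asr_s32() helper from <xnnpack/scalar-utils.h> for this), with parameter names mirroring the params->scalar fields read above and an illustrative helper name:

#include <stdint.h>

// Requantize one int32 accumulator to uint8, following the deleted kernels'
// sequence: Q31 fixed-point multiply, rounding right shift with remainder
// correction, clamp, then shift into the uint8 domain via the zero point.
static inline uint8_t requantize_q31(
    int32_t acc, int32_t multiplier, uint32_t shift,
    int32_t remainder_mask, int32_t remainder_threshold,
    int32_t output_min_less_zero_point, int32_t output_max_less_zero_point,
    int32_t output_zero_point) {
  // Multiply into Q31 with rounding: (acc * multiplier + 2^30) >> 31.
  const int64_t product = (int64_t) acc * (int64_t) multiplier;
  const int32_t q31product =
      (int32_t) (uint32_t) ((uint64_t) (product + INT64_C(0x40000000)) >> 31);
  // Rounding division by 2^shift: arithmetic shift plus a correction derived
  // from the discarded low bits.
  const int32_t remainder =
      (q31product & remainder_mask) - (int32_t) (q31product < 0);
  int32_t out = (q31product >> shift) + (int32_t) (remainder > remainder_threshold);
  // Clamp (still relative to the output zero point), then add the zero point.
  out = out < output_min_less_zero_point ? output_min_less_zero_point : out;
  out = out > output_max_less_zero_point ? output_max_less_zero_point : out;
  return (uint8_t) (out + output_zero_point);
}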
diff --git a/src/qu8-gemm/gen/8x8c4-minmax-scalar.c b/src/qu8-gemm/gen/8x8c4-minmax-scalar.c
deleted file mode 100644
index 57297de..0000000
--- a/src/qu8-gemm/gen/8x8c4-minmax-scalar.c
+++ /dev/null
@@ -1,1263 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/qu8-gemm/MRxNRc4-minmax-scalar.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <xnnpack/gemm.h>
-
-#include <xnnpack/scalar-utils.h>
-
-// This kernel is a scalar model for a kernel using ARMv8.2 dot-product
-// instructions.
-//
-// XNN_DISABLE_TSAN is used because this kernel reads up to 3 bytes past the
-// bounds of the `a` matrix region, which may be a race condition with
-// another thread. We deem this acceptable because the values that are
-// read out of bounds do not affect the result, and the compiler can't know
-// about this undefined behavior.
-void xnn_qu8_gemm_minmax_ukernel_8x8c4__scalar(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    const uint8_t* restrict a,
-    size_t a_stride,
-    const void* restrict w,
-    uint8_t* restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    const union xnn_qu8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN {
-  assert(mr != 0);
-  assert(mr <= 8);
-  assert(nc != 0);
-  assert(kc != 0);
-
-  const uint8_t* a0 = a;
-  uint8_t* c0 = c;
-  const uint8_t* a1 = (const uint8_t*) ((uintptr_t) a0 + a_stride);
-  uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 2) {
-    a1 = a0;
-    c1 = c0;
-  }
-  const uint8_t* a2 = (const uint8_t*) ((uintptr_t) a1 + a_stride);
-  uint8_t* c2 = (uint8_t*) ((uintptr_t) c1 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 2) {
-    a2 = a1;
-    c2 = c1;
-  }
-  const uint8_t* a3 = (const uint8_t*) ((uintptr_t) a2 + a_stride);
-  uint8_t* c3 = (uint8_t*) ((uintptr_t) c2 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 4) {
-    a3 = a2;
-    c3 = c2;
-  }
-  const uint8_t* a4 = (const uint8_t*) ((uintptr_t) a3 + a_stride);
-  uint8_t* c4 = (uint8_t*) ((uintptr_t) c3 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 4) {
-    a4 = a3;
-    c4 = c3;
-  }
-  const uint8_t* a5 = (const uint8_t*) ((uintptr_t) a4 + a_stride);
-  uint8_t* c5 = (uint8_t*) ((uintptr_t) c4 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 6) {
-    a5 = a4;
-    c5 = c4;
-  }
-  const uint8_t* a6 = (const uint8_t*) ((uintptr_t) a5 + a_stride);
-  uint8_t* c6 = (uint8_t*) ((uintptr_t) c5 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 6) {
-    a6 = a5;
-    c6 = c5;
-  }
-  const uint8_t* a7 = (const uint8_t*) ((uintptr_t) a6 + a_stride);
-  uint8_t* c7 = (uint8_t*) ((uintptr_t) c6 + cm_stride);
-  if XNN_UNPREDICTABLE(mr != 8) {
-    a7 = a6;
-    c7 = c6;
-  }
-
-  const int32_t vb_zero_point = params->scalar.kernel_zero_point;
-
-  // Loop over groups of 8 columns.
-  do {
-    // `vaccMN` is the accumulator at row `M`, column `N`.
-    // Initialize accumulators with bias. 8 bias values are loaded from the
-    // weight matrix, at the start of the group of 8 columns.
-    int32_t bias0 = ((const int32_t*)w)[0];
-    int32_t vacc00 = bias0;
-    int32_t vacc10 = bias0;
-    int32_t vacc20 = bias0;
-    int32_t vacc30 = bias0;
-    int32_t vacc40 = bias0;
-    int32_t vacc50 = bias0;
-    int32_t vacc60 = bias0;
-    int32_t vacc70 = bias0;
-    int32_t bias1 = ((const int32_t*)w)[1];
-    int32_t vacc01 = bias1;
-    int32_t vacc11 = bias1;
-    int32_t vacc21 = bias1;
-    int32_t vacc31 = bias1;
-    int32_t vacc41 = bias1;
-    int32_t vacc51 = bias1;
-    int32_t vacc61 = bias1;
-    int32_t vacc71 = bias1;
-    int32_t bias2 = ((const int32_t*)w)[2];
-    int32_t vacc02 = bias2;
-    int32_t vacc12 = bias2;
-    int32_t vacc22 = bias2;
-    int32_t vacc32 = bias2;
-    int32_t vacc42 = bias2;
-    int32_t vacc52 = bias2;
-    int32_t vacc62 = bias2;
-    int32_t vacc72 = bias2;
-    int32_t bias3 = ((const int32_t*)w)[3];
-    int32_t vacc03 = bias3;
-    int32_t vacc13 = bias3;
-    int32_t vacc23 = bias3;
-    int32_t vacc33 = bias3;
-    int32_t vacc43 = bias3;
-    int32_t vacc53 = bias3;
-    int32_t vacc63 = bias3;
-    int32_t vacc73 = bias3;
-    int32_t bias4 = ((const int32_t*)w)[4];
-    int32_t vacc04 = bias4;
-    int32_t vacc14 = bias4;
-    int32_t vacc24 = bias4;
-    int32_t vacc34 = bias4;
-    int32_t vacc44 = bias4;
-    int32_t vacc54 = bias4;
-    int32_t vacc64 = bias4;
-    int32_t vacc74 = bias4;
-    int32_t bias5 = ((const int32_t*)w)[5];
-    int32_t vacc05 = bias5;
-    int32_t vacc15 = bias5;
-    int32_t vacc25 = bias5;
-    int32_t vacc35 = bias5;
-    int32_t vacc45 = bias5;
-    int32_t vacc55 = bias5;
-    int32_t vacc65 = bias5;
-    int32_t vacc75 = bias5;
-    int32_t bias6 = ((const int32_t*)w)[6];
-    int32_t vacc06 = bias6;
-    int32_t vacc16 = bias6;
-    int32_t vacc26 = bias6;
-    int32_t vacc36 = bias6;
-    int32_t vacc46 = bias6;
-    int32_t vacc56 = bias6;
-    int32_t vacc66 = bias6;
-    int32_t vacc76 = bias6;
-    int32_t bias7 = ((const int32_t*)w)[7];
-    int32_t vacc07 = bias7;
-    int32_t vacc17 = bias7;
-    int32_t vacc27 = bias7;
-    int32_t vacc37 = bias7;
-    int32_t vacc47 = bias7;
-    int32_t vacc57 = bias7;
-    int32_t vacc67 = bias7;
-    int32_t vacc77 = bias7;
-
-    w = (const void*)((uintptr_t)w + 8 * sizeof(int32_t));
-
-    // Inner accumulation loop over the K dimension, accumulating into all
-    // 8 columns. Each iteration handles 4 K values: this is key to modelling
-    // what an actual kernel using ARMv8.2 dot-product instructions would
-    // look like.
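-    // Each iteration also accumulates the sum of the 4 activation bytes of
-    // every row (`vasum0`...`vasum7`). Subtracting `vasumM * vb_zero_point`
-    // from each accumulator of row M is algebraically the same as multiplying
-    // by the zero-point-adjusted weights,
-    //
-    //   sum_j a[j] * (b[j] - zero_point) == sum_j a[j] * b[j] - zero_point * sum_j a[j],
-    //
-    // which lets the inner loop multiply the raw uint8 weights directly.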
-    size_t k = 0;
-    while (k < kc) {
-      // Load an 8x4 block of activations, and compute sums along rows.
-      int16_t vasum0 = 0;
-      int32_t va00 = *a0++;
-      vasum0 += (int16_t) va00;
-      int32_t va01 = *a0++;
-      vasum0 += (int16_t) va01;
-      int32_t va02 = *a0++;
-      vasum0 += (int16_t) va02;
-      int32_t va03 = *a0++;
-      vasum0 += (int16_t) va03;
-      int16_t vasum1 = 0;
-      int32_t va10 = *a1++;
-      vasum1 += (int16_t) va10;
-      int32_t va11 = *a1++;
-      vasum1 += (int16_t) va11;
-      int32_t va12 = *a1++;
-      vasum1 += (int16_t) va12;
-      int32_t va13 = *a1++;
-      vasum1 += (int16_t) va13;
-      int16_t vasum2 = 0;
-      int32_t va20 = *a2++;
-      vasum2 += (int16_t) va20;
-      int32_t va21 = *a2++;
-      vasum2 += (int16_t) va21;
-      int32_t va22 = *a2++;
-      vasum2 += (int16_t) va22;
-      int32_t va23 = *a2++;
-      vasum2 += (int16_t) va23;
-      int16_t vasum3 = 0;
-      int32_t va30 = *a3++;
-      vasum3 += (int16_t) va30;
-      int32_t va31 = *a3++;
-      vasum3 += (int16_t) va31;
-      int32_t va32 = *a3++;
-      vasum3 += (int16_t) va32;
-      int32_t va33 = *a3++;
-      vasum3 += (int16_t) va33;
-      int16_t vasum4 = 0;
-      int32_t va40 = *a4++;
-      vasum4 += (int16_t) va40;
-      int32_t va41 = *a4++;
-      vasum4 += (int16_t) va41;
-      int32_t va42 = *a4++;
-      vasum4 += (int16_t) va42;
-      int32_t va43 = *a4++;
-      vasum4 += (int16_t) va43;
-      int16_t vasum5 = 0;
-      int32_t va50 = *a5++;
-      vasum5 += (int16_t) va50;
-      int32_t va51 = *a5++;
-      vasum5 += (int16_t) va51;
-      int32_t va52 = *a5++;
-      vasum5 += (int16_t) va52;
-      int32_t va53 = *a5++;
-      vasum5 += (int16_t) va53;
-      int16_t vasum6 = 0;
-      int32_t va60 = *a6++;
-      vasum6 += (int16_t) va60;
-      int32_t va61 = *a6++;
-      vasum6 += (int16_t) va61;
-      int32_t va62 = *a6++;
-      vasum6 += (int16_t) va62;
-      int32_t va63 = *a6++;
-      vasum6 += (int16_t) va63;
-      int16_t vasum7 = 0;
-      int32_t va70 = *a7++;
-      vasum7 += (int16_t) va70;
-      int32_t va71 = *a7++;
-      vasum7 += (int16_t) va71;
-      int32_t va72 = *a7++;
-      vasum7 += (int16_t) va72;
-      int32_t va73 = *a7++;
-      vasum7 += (int16_t) va73;
-
-      // Load a 4x8 block of weights.
-      int32_t vb00 = (int32_t) ((const uint8_t*)w)[0];
-      int32_t vb10 = (int32_t) ((const uint8_t*)w)[1];
-      int32_t vb20 = (int32_t) ((const uint8_t*)w)[2];
-      int32_t vb30 = (int32_t) ((const uint8_t*)w)[3];
-
-      w = (const void*)((uintptr_t)w + 4 * sizeof(uint8_t));
-      int32_t vb01 = (int32_t) ((const uint8_t*)w)[0];
-      int32_t vb11 = (int32_t) ((const uint8_t*)w)[1];
-      int32_t vb21 = (int32_t) ((const uint8_t*)w)[2];
-      int32_t vb31 = (int32_t) ((const uint8_t*)w)[3];
-
-      w = (const void*)((uintptr_t)w + 4 * sizeof(uint8_t));
-      int32_t vb02 = (int32_t) ((const uint8_t*)w)[0];
-      int32_t vb12 = (int32_t) ((const uint8_t*)w)[1];
-      int32_t vb22 = (int32_t) ((const uint8_t*)w)[2];
-      int32_t vb32 = (int32_t) ((const uint8_t*)w)[3];
-
-      w = (const void*)((uintptr_t)w + 4 * sizeof(uint8_t));
-      int32_t vb03 = (int32_t) ((const uint8_t*)w)[0];
-      int32_t vb13 = (int32_t) ((const uint8_t*)w)[1];
-      int32_t vb23 = (int32_t) ((const uint8_t*)w)[2];
-      int32_t vb33 = (int32_t) ((const uint8_t*)w)[3];
-
-      w = (const void*)((uintptr_t)w + 4 * sizeof(uint8_t));
-      int32_t vb04 = (int32_t) ((const uint8_t*)w)[0];
-      int32_t vb14 = (int32_t) ((const uint8_t*)w)[1];
-      int32_t vb24 = (int32_t) ((const uint8_t*)w)[2];
-      int32_t vb34 = (int32_t) ((const uint8_t*)w)[3];
-
-      w = (const void*)((uintptr_t)w + 4 * sizeof(uint8_t));
-      int32_t vb05 = (int32_t) ((const uint8_t*)w)[0];
-      int32_t vb15 = (int32_t) ((const uint8_t*)w)[1];
-      int32_t vb25 = (int32_t) ((const uint8_t*)w)[2];
-      int32_t vb35 = (int32_t) ((const uint8_t*)w)[3];
-
-      w = (const void*)((uintptr_t)w + 4 * sizeof(uint8_t));
-      int32_t vb06 = (int32_t) ((const uint8_t*)w)[0];
-      int32_t vb16 = (int32_t) ((const uint8_t*)w)[1];
-      int32_t vb26 = (int32_t) ((const uint8_t*)w)[2];
-      int32_t vb36 = (int32_t) ((const uint8_t*)w)[3];
-
-      w = (const void*)((uintptr_t)w + 4 * sizeof(uint8_t));
-      int32_t vb07 = (int32_t) ((const uint8_t*)w)[0];
-      int32_t vb17 = (int32_t) ((const uint8_t*)w)[1];
-      int32_t vb27 = (int32_t) ((const uint8_t*)w)[2];
-      int32_t vb37 = (int32_t) ((const uint8_t*)w)[3];
-
-      w = (const void*)((uintptr_t)w + 4 * sizeof(uint8_t));
-
-      // Multiply-accumulate: 8x4 * 4x8 --> 8x8. The inner size 4 here means
-      // we're computing 4D dot-products, which makes this a model for
-      // an ARMv8.2 dot-product kernel.
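-      // As an illustrative mapping only (this scalar code uses no NEON
-      // intrinsics): each group of four multiply-adds below is the work one
-      // UDOT lane would do in a single instruction, i.e.
-      //
-      //   acc[m][n] += a[m][k+0]*b[k+0][n] + a[m][k+1]*b[k+1][n]
-      //              + a[m][k+2]*b[k+2][n] + a[m][k+3]*b[k+3][n];
-      //
-      // while the trailing `-= vasumM * vb_zero_point` is the separate
-      // zero-point correction described above.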
-      vacc00 += va00 * vb00;
-      vacc00 += va01 * vb10;
-      vacc00 += va02 * vb20;
-      vacc00 += va03 * vb30;
-      vacc00 -= ((int32_t) vasum0) * vb_zero_point;
-      vacc01 += va00 * vb01;
-      vacc01 += va01 * vb11;
-      vacc01 += va02 * vb21;
-      vacc01 += va03 * vb31;
-      vacc01 -= ((int32_t) vasum0) * vb_zero_point;
-      vacc02 += va00 * vb02;
-      vacc02 += va01 * vb12;
-      vacc02 += va02 * vb22;
-      vacc02 += va03 * vb32;
-      vacc02 -= ((int32_t) vasum0) * vb_zero_point;
-      vacc03 += va00 * vb03;
-      vacc03 += va01 * vb13;
-      vacc03 += va02 * vb23;
-      vacc03 += va03 * vb33;
-      vacc03 -= ((int32_t) vasum0) * vb_zero_point;
-      vacc04 += va00 * vb04;
-      vacc04 += va01 * vb14;
-      vacc04 += va02 * vb24;
-      vacc04 += va03 * vb34;
-      vacc04 -= ((int32_t) vasum0) * vb_zero_point;
-      vacc05 += va00 * vb05;
-      vacc05 += va01 * vb15;
-      vacc05 += va02 * vb25;
-      vacc05 += va03 * vb35;
-      vacc05 -= ((int32_t) vasum0) * vb_zero_point;
-      vacc06 += va00 * vb06;
-      vacc06 += va01 * vb16;
-      vacc06 += va02 * vb26;
-      vacc06 += va03 * vb36;
-      vacc06 -= ((int32_t) vasum0) * vb_zero_point;
-      vacc07 += va00 * vb07;
-      vacc07 += va01 * vb17;
-      vacc07 += va02 * vb27;
-      vacc07 += va03 * vb37;
-      vacc07 -= ((int32_t) vasum0) * vb_zero_point;
-      vacc10 += va10 * vb00;
-      vacc10 += va11 * vb10;
-      vacc10 += va12 * vb20;
-      vacc10 += va13 * vb30;
-      vacc10 -= ((int32_t) vasum1) * vb_zero_point;
-      vacc11 += va10 * vb01;
-      vacc11 += va11 * vb11;
-      vacc11 += va12 * vb21;
-      vacc11 += va13 * vb31;
-      vacc11 -= ((int32_t) vasum1) * vb_zero_point;
-      vacc12 += va10 * vb02;
-      vacc12 += va11 * vb12;
-      vacc12 += va12 * vb22;
-      vacc12 += va13 * vb32;
-      vacc12 -= ((int32_t) vasum1) * vb_zero_point;
-      vacc13 += va10 * vb03;
-      vacc13 += va11 * vb13;
-      vacc13 += va12 * vb23;
-      vacc13 += va13 * vb33;
-      vacc13 -= ((int32_t) vasum1) * vb_zero_point;
-      vacc14 += va10 * vb04;
-      vacc14 += va11 * vb14;
-      vacc14 += va12 * vb24;
-      vacc14 += va13 * vb34;
-      vacc14 -= ((int32_t) vasum1) * vb_zero_point;
-      vacc15 += va10 * vb05;
-      vacc15 += va11 * vb15;
-      vacc15 += va12 * vb25;
-      vacc15 += va13 * vb35;
-      vacc15 -= ((int32_t) vasum1) * vb_zero_point;
-      vacc16 += va10 * vb06;
-      vacc16 += va11 * vb16;
-      vacc16 += va12 * vb26;
-      vacc16 += va13 * vb36;
-      vacc16 -= ((int32_t) vasum1) * vb_zero_point;
-      vacc17 += va10 * vb07;
-      vacc17 += va11 * vb17;
-      vacc17 += va12 * vb27;
-      vacc17 += va13 * vb37;
-      vacc17 -= ((int32_t) vasum1) * vb_zero_point;
-      vacc20 += va20 * vb00;
-      vacc20 += va21 * vb10;
-      vacc20 += va22 * vb20;
-      vacc20 += va23 * vb30;
-      vacc20 -= ((int32_t) vasum2) * vb_zero_point;
-      vacc21 += va20 * vb01;
-      vacc21 += va21 * vb11;
-      vacc21 += va22 * vb21;
-      vacc21 += va23 * vb31;
-      vacc21 -= ((int32_t) vasum2) * vb_zero_point;
-      vacc22 += va20 * vb02;
-      vacc22 += va21 * vb12;
-      vacc22 += va22 * vb22;
-      vacc22 += va23 * vb32;
-      vacc22 -= ((int32_t) vasum2) * vb_zero_point;
-      vacc23 += va20 * vb03;
-      vacc23 += va21 * vb13;
-      vacc23 += va22 * vb23;
-      vacc23 += va23 * vb33;
-      vacc23 -= ((int32_t) vasum2) * vb_zero_point;
-      vacc24 += va20 * vb04;
-      vacc24 += va21 * vb14;
-      vacc24 += va22 * vb24;
-      vacc24 += va23 * vb34;
-      vacc24 -= ((int32_t) vasum2) * vb_zero_point;
-      vacc25 += va20 * vb05;
-      vacc25 += va21 * vb15;
-      vacc25 += va22 * vb25;
-      vacc25 += va23 * vb35;
-      vacc25 -= ((int32_t) vasum2) * vb_zero_point;
-      vacc26 += va20 * vb06;
-      vacc26 += va21 * vb16;
-      vacc26 += va22 * vb26;
-      vacc26 += va23 * vb36;
-      vacc26 -= ((int32_t) vasum2) * vb_zero_point;
-      vacc27 += va20 * vb07;
-      vacc27 += va21 * vb17;
-      vacc27 += va22 * vb27;
-      vacc27 += va23 * vb37;
-      vacc27 -= ((int32_t) vasum2) * vb_zero_point;
-      vacc30 += va30 * vb00;
-      vacc30 += va31 * vb10;
-      vacc30 += va32 * vb20;
-      vacc30 += va33 * vb30;
-      vacc30 -= ((int32_t) vasum3) * vb_zero_point;
-      vacc31 += va30 * vb01;
-      vacc31 += va31 * vb11;
-      vacc31 += va32 * vb21;
-      vacc31 += va33 * vb31;
-      vacc31 -= ((int32_t) vasum3) * vb_zero_point;
-      vacc32 += va30 * vb02;
-      vacc32 += va31 * vb12;
-      vacc32 += va32 * vb22;
-      vacc32 += va33 * vb32;
-      vacc32 -= ((int32_t) vasum3) * vb_zero_point;
-      vacc33 += va30 * vb03;
-      vacc33 += va31 * vb13;
-      vacc33 += va32 * vb23;
-      vacc33 += va33 * vb33;
-      vacc33 -= ((int32_t) vasum3) * vb_zero_point;
-      vacc34 += va30 * vb04;
-      vacc34 += va31 * vb14;
-      vacc34 += va32 * vb24;
-      vacc34 += va33 * vb34;
-      vacc34 -= ((int32_t) vasum3) * vb_zero_point;
-      vacc35 += va30 * vb05;
-      vacc35 += va31 * vb15;
-      vacc35 += va32 * vb25;
-      vacc35 += va33 * vb35;
-      vacc35 -= ((int32_t) vasum3) * vb_zero_point;
-      vacc36 += va30 * vb06;
-      vacc36 += va31 * vb16;
-      vacc36 += va32 * vb26;
-      vacc36 += va33 * vb36;
-      vacc36 -= ((int32_t) vasum3) * vb_zero_point;
-      vacc37 += va30 * vb07;
-      vacc37 += va31 * vb17;
-      vacc37 += va32 * vb27;
-      vacc37 += va33 * vb37;
-      vacc37 -= ((int32_t) vasum3) * vb_zero_point;
-      vacc40 += va40 * vb00;
-      vacc40 += va41 * vb10;
-      vacc40 += va42 * vb20;
-      vacc40 += va43 * vb30;
-      vacc40 -= ((int32_t) vasum4) * vb_zero_point;
-      vacc41 += va40 * vb01;
-      vacc41 += va41 * vb11;
-      vacc41 += va42 * vb21;
-      vacc41 += va43 * vb31;
-      vacc41 -= ((int32_t) vasum4) * vb_zero_point;
-      vacc42 += va40 * vb02;
-      vacc42 += va41 * vb12;
-      vacc42 += va42 * vb22;
-      vacc42 += va43 * vb32;
-      vacc42 -= ((int32_t) vasum4) * vb_zero_point;
-      vacc43 += va40 * vb03;
-      vacc43 += va41 * vb13;
-      vacc43 += va42 * vb23;
-      vacc43 += va43 * vb33;
-      vacc43 -= ((int32_t) vasum4) * vb_zero_point;
-      vacc44 += va40 * vb04;
-      vacc44 += va41 * vb14;
-      vacc44 += va42 * vb24;
-      vacc44 += va43 * vb34;
-      vacc44 -= ((int32_t) vasum4) * vb_zero_point;
-      vacc45 += va40 * vb05;
-      vacc45 += va41 * vb15;
-      vacc45 += va42 * vb25;
-      vacc45 += va43 * vb35;
-      vacc45 -= ((int32_t) vasum4) * vb_zero_point;
-      vacc46 += va40 * vb06;
-      vacc46 += va41 * vb16;
-      vacc46 += va42 * vb26;
-      vacc46 += va43 * vb36;
-      vacc46 -= ((int32_t) vasum4) * vb_zero_point;
-      vacc47 += va40 * vb07;
-      vacc47 += va41 * vb17;
-      vacc47 += va42 * vb27;
-      vacc47 += va43 * vb37;
-      vacc47 -= ((int32_t) vasum4) * vb_zero_point;
-      vacc50 += va50 * vb00;
-      vacc50 += va51 * vb10;
-      vacc50 += va52 * vb20;
-      vacc50 += va53 * vb30;
-      vacc50 -= ((int32_t) vasum5) * vb_zero_point;
-      vacc51 += va50 * vb01;
-      vacc51 += va51 * vb11;
-      vacc51 += va52 * vb21;
-      vacc51 += va53 * vb31;
-      vacc51 -= ((int32_t) vasum5) * vb_zero_point;
-      vacc52 += va50 * vb02;
-      vacc52 += va51 * vb12;
-      vacc52 += va52 * vb22;
-      vacc52 += va53 * vb32;
-      vacc52 -= ((int32_t) vasum5) * vb_zero_point;
-      vacc53 += va50 * vb03;
-      vacc53 += va51 * vb13;
-      vacc53 += va52 * vb23;
-      vacc53 += va53 * vb33;
-      vacc53 -= ((int32_t) vasum5) * vb_zero_point;
-      vacc54 += va50 * vb04;
-      vacc54 += va51 * vb14;
-      vacc54 += va52 * vb24;
-      vacc54 += va53 * vb34;
-      vacc54 -= ((int32_t) vasum5) * vb_zero_point;
-      vacc55 += va50 * vb05;
-      vacc55 += va51 * vb15;
-      vacc55 += va52 * vb25;
-      vacc55 += va53 * vb35;
-      vacc55 -= ((int32_t) vasum5) * vb_zero_point;
-      vacc56 += va50 * vb06;
-      vacc56 += va51 * vb16;
-      vacc56 += va52 * vb26;
-      vacc56 += va53 * vb36;
-      vacc56 -= ((int32_t) vasum5) * vb_zero_point;
-      vacc57 += va50 * vb07;
-      vacc57 += va51 * vb17;
-      vacc57 += va52 * vb27;
-      vacc57 += va53 * vb37;
-      vacc57 -= ((int32_t) vasum5) * vb_zero_point;
-      vacc60 += va60 * vb00;
-      vacc60 += va61 * vb10;
-      vacc60 += va62 * vb20;
-      vacc60 += va63 * vb30;
-      vacc60 -= ((int32_t) vasum6) * vb_zero_point;
-      vacc61 += va60 * vb01;
-      vacc61 += va61 * vb11;
-      vacc61 += va62 * vb21;
-      vacc61 += va63 * vb31;
-      vacc61 -= ((int32_t) vasum6) * vb_zero_point;
-      vacc62 += va60 * vb02;
-      vacc62 += va61 * vb12;
-      vacc62 += va62 * vb22;
-      vacc62 += va63 * vb32;
-      vacc62 -= ((int32_t) vasum6) * vb_zero_point;
-      vacc63 += va60 * vb03;
-      vacc63 += va61 * vb13;
-      vacc63 += va62 * vb23;
-      vacc63 += va63 * vb33;
-      vacc63 -= ((int32_t) vasum6) * vb_zero_point;
-      vacc64 += va60 * vb04;
-      vacc64 += va61 * vb14;
-      vacc64 += va62 * vb24;
-      vacc64 += va63 * vb34;
-      vacc64 -= ((int32_t) vasum6) * vb_zero_point;
-      vacc65 += va60 * vb05;
-      vacc65 += va61 * vb15;
-      vacc65 += va62 * vb25;
-      vacc65 += va63 * vb35;
-      vacc65 -= ((int32_t) vasum6) * vb_zero_point;
-      vacc66 += va60 * vb06;
-      vacc66 += va61 * vb16;
-      vacc66 += va62 * vb26;
-      vacc66 += va63 * vb36;
-      vacc66 -= ((int32_t) vasum6) * vb_zero_point;
-      vacc67 += va60 * vb07;
-      vacc67 += va61 * vb17;
-      vacc67 += va62 * vb27;
-      vacc67 += va63 * vb37;
-      vacc67 -= ((int32_t) vasum6) * vb_zero_point;
-      vacc70 += va70 * vb00;
-      vacc70 += va71 * vb10;
-      vacc70 += va72 * vb20;
-      vacc70 += va73 * vb30;
-      vacc70 -= ((int32_t) vasum7) * vb_zero_point;
-      vacc71 += va70 * vb01;
-      vacc71 += va71 * vb11;
-      vacc71 += va72 * vb21;
-      vacc71 += va73 * vb31;
-      vacc71 -= ((int32_t) vasum7) * vb_zero_point;
-      vacc72 += va70 * vb02;
-      vacc72 += va71 * vb12;
-      vacc72 += va72 * vb22;
-      vacc72 += va73 * vb32;
-      vacc72 -= ((int32_t) vasum7) * vb_zero_point;
-      vacc73 += va70 * vb03;
-      vacc73 += va71 * vb13;
-      vacc73 += va72 * vb23;
-      vacc73 += va73 * vb33;
-      vacc73 -= ((int32_t) vasum7) * vb_zero_point;
-      vacc74 += va70 * vb04;
-      vacc74 += va71 * vb14;
-      vacc74 += va72 * vb24;
-      vacc74 += va73 * vb34;
-      vacc74 -= ((int32_t) vasum7) * vb_zero_point;
-      vacc75 += va70 * vb05;
-      vacc75 += va71 * vb15;
-      vacc75 += va72 * vb25;
-      vacc75 += va73 * vb35;
-      vacc75 -= ((int32_t) vasum7) * vb_zero_point;
-      vacc76 += va70 * vb06;
-      vacc76 += va71 * vb16;
-      vacc76 += va72 * vb26;
-      vacc76 += va73 * vb36;
-      vacc76 -= ((int32_t) vasum7) * vb_zero_point;
-      vacc77 += va70 * vb07;
-      vacc77 += va71 * vb17;
-      vacc77 += va72 * vb27;
-      vacc77 += va73 * vb37;
-      vacc77 -= ((int32_t) vasum7) * vb_zero_point;
-
-      k += 4 * sizeof(uint8_t);
-    }
-    // End of accumulation loop. The variable `k` contains the amount by which
-    // we advanced the `a` pointers, so we rewind them by this amount now.
-    a0 = (const uint8_t*)((uintptr_t)a0 - k);
-    a1 = (const uint8_t*)((uintptr_t)a1 - k);
-    a2 = (const uint8_t*)((uintptr_t)a2 - k);
-    a3 = (const uint8_t*)((uintptr_t)a3 - k);
-    a4 = (const uint8_t*)((uintptr_t)a4 - k);
-    a5 = (const uint8_t*)((uintptr_t)a5 - k);
-    a6 = (const uint8_t*)((uintptr_t)a6 - k);
-    a7 = (const uint8_t*)((uintptr_t)a7 - k);
-
-    // Post-accumulation work
-
-    const int32_t vmultiplier = params->scalar.multiplier;
-    const int64_t vq31rounding = INT64_C(0x40000000);
-    const int32_t vremainder_mask = params->scalar.remainder_mask;
-    const uint32_t vshift = params->scalar.shift;
-    const int32_t vremainder_threshold = params->scalar.remainder_threshold;
-    const int32_t voutput_min = params->scalar.output_min_less_zero_point;
-    const int32_t voutput_max = params->scalar.output_max_less_zero_point;
-    const int32_t voutput_zero_point = params->scalar.output_zero_point;
-
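-    // The rest of this block requantizes each int32 accumulator back to a
-    // uint8 output value. As an informal summary (rounding details are as
-    // coded below):
-    //
-    //   out = clamp(round(acc * multiplier / 2^(31 + shift)) + output_zero_point)
-    //
-    // implemented as a Q31 fixed-point multiply, a rounding shift by 31, an
-    // arithmetic shift by `shift` with a remainder-based rounding correction,
-    // a clamp against zero-point-relative bounds, and finally the zero-point
-    // addition.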
-    const int64_t vproduct00 = (int64_t)vacc00 * (int64_t)vmultiplier;
-    const int64_t vproduct01 = (int64_t)vacc01 * (int64_t)vmultiplier;
-    const int64_t vproduct02 = (int64_t)vacc02 * (int64_t)vmultiplier;
-    const int64_t vproduct03 = (int64_t)vacc03 * (int64_t)vmultiplier;
-    const int64_t vproduct04 = (int64_t)vacc04 * (int64_t)vmultiplier;
-    const int64_t vproduct05 = (int64_t)vacc05 * (int64_t)vmultiplier;
-    const int64_t vproduct06 = (int64_t)vacc06 * (int64_t)vmultiplier;
-    const int64_t vproduct07 = (int64_t)vacc07 * (int64_t)vmultiplier;
-    const int64_t vproduct10 = (int64_t)vacc10 * (int64_t)vmultiplier;
-    const int64_t vproduct11 = (int64_t)vacc11 * (int64_t)vmultiplier;
-    const int64_t vproduct12 = (int64_t)vacc12 * (int64_t)vmultiplier;
-    const int64_t vproduct13 = (int64_t)vacc13 * (int64_t)vmultiplier;
-    const int64_t vproduct14 = (int64_t)vacc14 * (int64_t)vmultiplier;
-    const int64_t vproduct15 = (int64_t)vacc15 * (int64_t)vmultiplier;
-    const int64_t vproduct16 = (int64_t)vacc16 * (int64_t)vmultiplier;
-    const int64_t vproduct17 = (int64_t)vacc17 * (int64_t)vmultiplier;
-    const int64_t vproduct20 = (int64_t)vacc20 * (int64_t)vmultiplier;
-    const int64_t vproduct21 = (int64_t)vacc21 * (int64_t)vmultiplier;
-    const int64_t vproduct22 = (int64_t)vacc22 * (int64_t)vmultiplier;
-    const int64_t vproduct23 = (int64_t)vacc23 * (int64_t)vmultiplier;
-    const int64_t vproduct24 = (int64_t)vacc24 * (int64_t)vmultiplier;
-    const int64_t vproduct25 = (int64_t)vacc25 * (int64_t)vmultiplier;
-    const int64_t vproduct26 = (int64_t)vacc26 * (int64_t)vmultiplier;
-    const int64_t vproduct27 = (int64_t)vacc27 * (int64_t)vmultiplier;
-    const int64_t vproduct30 = (int64_t)vacc30 * (int64_t)vmultiplier;
-    const int64_t vproduct31 = (int64_t)vacc31 * (int64_t)vmultiplier;
-    const int64_t vproduct32 = (int64_t)vacc32 * (int64_t)vmultiplier;
-    const int64_t vproduct33 = (int64_t)vacc33 * (int64_t)vmultiplier;
-    const int64_t vproduct34 = (int64_t)vacc34 * (int64_t)vmultiplier;
-    const int64_t vproduct35 = (int64_t)vacc35 * (int64_t)vmultiplier;
-    const int64_t vproduct36 = (int64_t)vacc36 * (int64_t)vmultiplier;
-    const int64_t vproduct37 = (int64_t)vacc37 * (int64_t)vmultiplier;
-    const int64_t vproduct40 = (int64_t)vacc40 * (int64_t)vmultiplier;
-    const int64_t vproduct41 = (int64_t)vacc41 * (int64_t)vmultiplier;
-    const int64_t vproduct42 = (int64_t)vacc42 * (int64_t)vmultiplier;
-    const int64_t vproduct43 = (int64_t)vacc43 * (int64_t)vmultiplier;
-    const int64_t vproduct44 = (int64_t)vacc44 * (int64_t)vmultiplier;
-    const int64_t vproduct45 = (int64_t)vacc45 * (int64_t)vmultiplier;
-    const int64_t vproduct46 = (int64_t)vacc46 * (int64_t)vmultiplier;
-    const int64_t vproduct47 = (int64_t)vacc47 * (int64_t)vmultiplier;
-    const int64_t vproduct50 = (int64_t)vacc50 * (int64_t)vmultiplier;
-    const int64_t vproduct51 = (int64_t)vacc51 * (int64_t)vmultiplier;
-    const int64_t vproduct52 = (int64_t)vacc52 * (int64_t)vmultiplier;
-    const int64_t vproduct53 = (int64_t)vacc53 * (int64_t)vmultiplier;
-    const int64_t vproduct54 = (int64_t)vacc54 * (int64_t)vmultiplier;
-    const int64_t vproduct55 = (int64_t)vacc55 * (int64_t)vmultiplier;
-    const int64_t vproduct56 = (int64_t)vacc56 * (int64_t)vmultiplier;
-    const int64_t vproduct57 = (int64_t)vacc57 * (int64_t)vmultiplier;
-    const int64_t vproduct60 = (int64_t)vacc60 * (int64_t)vmultiplier;
-    const int64_t vproduct61 = (int64_t)vacc61 * (int64_t)vmultiplier;
-    const int64_t vproduct62 = (int64_t)vacc62 * (int64_t)vmultiplier;
-    const int64_t vproduct63 = (int64_t)vacc63 * (int64_t)vmultiplier;
-    const int64_t vproduct64 = (int64_t)vacc64 * (int64_t)vmultiplier;
-    const int64_t vproduct65 = (int64_t)vacc65 * (int64_t)vmultiplier;
-    const int64_t vproduct66 = (int64_t)vacc66 * (int64_t)vmultiplier;
-    const int64_t vproduct67 = (int64_t)vacc67 * (int64_t)vmultiplier;
-    const int64_t vproduct70 = (int64_t)vacc70 * (int64_t)vmultiplier;
-    const int64_t vproduct71 = (int64_t)vacc71 * (int64_t)vmultiplier;
-    const int64_t vproduct72 = (int64_t)vacc72 * (int64_t)vmultiplier;
-    const int64_t vproduct73 = (int64_t)vacc73 * (int64_t)vmultiplier;
-    const int64_t vproduct74 = (int64_t)vacc74 * (int64_t)vmultiplier;
-    const int64_t vproduct75 = (int64_t)vacc75 * (int64_t)vmultiplier;
-    const int64_t vproduct76 = (int64_t)vacc76 * (int64_t)vmultiplier;
-    const int64_t vproduct77 = (int64_t)vacc77 * (int64_t)vmultiplier;
-
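-    // The rounding shift below, (product + 2^30) >> 31, is a 64-bit scalar
-    // stand-in for the rounding-doubling high multiply (e.g. NEON SQRDMULH)
-    // that vector kernels typically use, minus that instruction's saturation
-    // of the single INT32_MIN * INT32_MIN overflow case. The detour through
-    // uint64_t keeps the shift well defined; only the low 32 bits are kept.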
-    const int32_t vq31product00 = (int32_t)(uint32_t)((uint64_t)(vproduct00 + vq31rounding) >> 31);
-    const int32_t vq31product01 = (int32_t)(uint32_t)((uint64_t)(vproduct01 + vq31rounding) >> 31);
-    const int32_t vq31product02 = (int32_t)(uint32_t)((uint64_t)(vproduct02 + vq31rounding) >> 31);
-    const int32_t vq31product03 = (int32_t)(uint32_t)((uint64_t)(vproduct03 + vq31rounding) >> 31);
-    const int32_t vq31product04 = (int32_t)(uint32_t)((uint64_t)(vproduct04 + vq31rounding) >> 31);
-    const int32_t vq31product05 = (int32_t)(uint32_t)((uint64_t)(vproduct05 + vq31rounding) >> 31);
-    const int32_t vq31product06 = (int32_t)(uint32_t)((uint64_t)(vproduct06 + vq31rounding) >> 31);
-    const int32_t vq31product07 = (int32_t)(uint32_t)((uint64_t)(vproduct07 + vq31rounding) >> 31);
-    const int32_t vq31product10 = (int32_t)(uint32_t)((uint64_t)(vproduct10 + vq31rounding) >> 31);
-    const int32_t vq31product11 = (int32_t)(uint32_t)((uint64_t)(vproduct11 + vq31rounding) >> 31);
-    const int32_t vq31product12 = (int32_t)(uint32_t)((uint64_t)(vproduct12 + vq31rounding) >> 31);
-    const int32_t vq31product13 = (int32_t)(uint32_t)((uint64_t)(vproduct13 + vq31rounding) >> 31);
-    const int32_t vq31product14 = (int32_t)(uint32_t)((uint64_t)(vproduct14 + vq31rounding) >> 31);
-    const int32_t vq31product15 = (int32_t)(uint32_t)((uint64_t)(vproduct15 + vq31rounding) >> 31);
-    const int32_t vq31product16 = (int32_t)(uint32_t)((uint64_t)(vproduct16 + vq31rounding) >> 31);
-    const int32_t vq31product17 = (int32_t)(uint32_t)((uint64_t)(vproduct17 + vq31rounding) >> 31);
-    const int32_t vq31product20 = (int32_t)(uint32_t)((uint64_t)(vproduct20 + vq31rounding) >> 31);
-    const int32_t vq31product21 = (int32_t)(uint32_t)((uint64_t)(vproduct21 + vq31rounding) >> 31);
-    const int32_t vq31product22 = (int32_t)(uint32_t)((uint64_t)(vproduct22 + vq31rounding) >> 31);
-    const int32_t vq31product23 = (int32_t)(uint32_t)((uint64_t)(vproduct23 + vq31rounding) >> 31);
-    const int32_t vq31product24 = (int32_t)(uint32_t)((uint64_t)(vproduct24 + vq31rounding) >> 31);
-    const int32_t vq31product25 = (int32_t)(uint32_t)((uint64_t)(vproduct25 + vq31rounding) >> 31);
-    const int32_t vq31product26 = (int32_t)(uint32_t)((uint64_t)(vproduct26 + vq31rounding) >> 31);
-    const int32_t vq31product27 = (int32_t)(uint32_t)((uint64_t)(vproduct27 + vq31rounding) >> 31);
-    const int32_t vq31product30 = (int32_t)(uint32_t)((uint64_t)(vproduct30 + vq31rounding) >> 31);
-    const int32_t vq31product31 = (int32_t)(uint32_t)((uint64_t)(vproduct31 + vq31rounding) >> 31);
-    const int32_t vq31product32 = (int32_t)(uint32_t)((uint64_t)(vproduct32 + vq31rounding) >> 31);
-    const int32_t vq31product33 = (int32_t)(uint32_t)((uint64_t)(vproduct33 + vq31rounding) >> 31);
-    const int32_t vq31product34 = (int32_t)(uint32_t)((uint64_t)(vproduct34 + vq31rounding) >> 31);
-    const int32_t vq31product35 = (int32_t)(uint32_t)((uint64_t)(vproduct35 + vq31rounding) >> 31);
-    const int32_t vq31product36 = (int32_t)(uint32_t)((uint64_t)(vproduct36 + vq31rounding) >> 31);
-    const int32_t vq31product37 = (int32_t)(uint32_t)((uint64_t)(vproduct37 + vq31rounding) >> 31);
-    const int32_t vq31product40 = (int32_t)(uint32_t)((uint64_t)(vproduct40 + vq31rounding) >> 31);
-    const int32_t vq31product41 = (int32_t)(uint32_t)((uint64_t)(vproduct41 + vq31rounding) >> 31);
-    const int32_t vq31product42 = (int32_t)(uint32_t)((uint64_t)(vproduct42 + vq31rounding) >> 31);
-    const int32_t vq31product43 = (int32_t)(uint32_t)((uint64_t)(vproduct43 + vq31rounding) >> 31);
-    const int32_t vq31product44 = (int32_t)(uint32_t)((uint64_t)(vproduct44 + vq31rounding) >> 31);
-    const int32_t vq31product45 = (int32_t)(uint32_t)((uint64_t)(vproduct45 + vq31rounding) >> 31);
-    const int32_t vq31product46 = (int32_t)(uint32_t)((uint64_t)(vproduct46 + vq31rounding) >> 31);
-    const int32_t vq31product47 = (int32_t)(uint32_t)((uint64_t)(vproduct47 + vq31rounding) >> 31);
-    const int32_t vq31product50 = (int32_t)(uint32_t)((uint64_t)(vproduct50 + vq31rounding) >> 31);
-    const int32_t vq31product51 = (int32_t)(uint32_t)((uint64_t)(vproduct51 + vq31rounding) >> 31);
-    const int32_t vq31product52 = (int32_t)(uint32_t)((uint64_t)(vproduct52 + vq31rounding) >> 31);
-    const int32_t vq31product53 = (int32_t)(uint32_t)((uint64_t)(vproduct53 + vq31rounding) >> 31);
-    const int32_t vq31product54 = (int32_t)(uint32_t)((uint64_t)(vproduct54 + vq31rounding) >> 31);
-    const int32_t vq31product55 = (int32_t)(uint32_t)((uint64_t)(vproduct55 + vq31rounding) >> 31);
-    const int32_t vq31product56 = (int32_t)(uint32_t)((uint64_t)(vproduct56 + vq31rounding) >> 31);
-    const int32_t vq31product57 = (int32_t)(uint32_t)((uint64_t)(vproduct57 + vq31rounding) >> 31);
-    const int32_t vq31product60 = (int32_t)(uint32_t)((uint64_t)(vproduct60 + vq31rounding) >> 31);
-    const int32_t vq31product61 = (int32_t)(uint32_t)((uint64_t)(vproduct61 + vq31rounding) >> 31);
-    const int32_t vq31product62 = (int32_t)(uint32_t)((uint64_t)(vproduct62 + vq31rounding) >> 31);
-    const int32_t vq31product63 = (int32_t)(uint32_t)((uint64_t)(vproduct63 + vq31rounding) >> 31);
-    const int32_t vq31product64 = (int32_t)(uint32_t)((uint64_t)(vproduct64 + vq31rounding) >> 31);
-    const int32_t vq31product65 = (int32_t)(uint32_t)((uint64_t)(vproduct65 + vq31rounding) >> 31);
-    const int32_t vq31product66 = (int32_t)(uint32_t)((uint64_t)(vproduct66 + vq31rounding) >> 31);
-    const int32_t vq31product67 = (int32_t)(uint32_t)((uint64_t)(vproduct67 + vq31rounding) >> 31);
-    const int32_t vq31product70 = (int32_t)(uint32_t)((uint64_t)(vproduct70 + vq31rounding) >> 31);
-    const int32_t vq31product71 = (int32_t)(uint32_t)((uint64_t)(vproduct71 + vq31rounding) >> 31);
-    const int32_t vq31product72 = (int32_t)(uint32_t)((uint64_t)(vproduct72 + vq31rounding) >> 31);
-    const int32_t vq31product73 = (int32_t)(uint32_t)((uint64_t)(vproduct73 + vq31rounding) >> 31);
-    const int32_t vq31product74 = (int32_t)(uint32_t)((uint64_t)(vproduct74 + vq31rounding) >> 31);
-    const int32_t vq31product75 = (int32_t)(uint32_t)((uint64_t)(vproduct75 + vq31rounding) >> 31);
-    const int32_t vq31product76 = (int32_t)(uint32_t)((uint64_t)(vproduct76 + vq31rounding) >> 31);
-    const int32_t vq31product77 = (int32_t)(uint32_t)((uint64_t)(vproduct77 + vq31rounding) >> 31);
-
-    const int32_t vremainder00 = (vq31product00 & vremainder_mask) - (int32_t)(vq31product00 < 0);
-    const int32_t vremainder01 = (vq31product01 & vremainder_mask) - (int32_t)(vq31product01 < 0);
-    const int32_t vremainder02 = (vq31product02 & vremainder_mask) - (int32_t)(vq31product02 < 0);
-    const int32_t vremainder03 = (vq31product03 & vremainder_mask) - (int32_t)(vq31product03 < 0);
-    const int32_t vremainder04 = (vq31product04 & vremainder_mask) - (int32_t)(vq31product04 < 0);
-    const int32_t vremainder05 = (vq31product05 & vremainder_mask) - (int32_t)(vq31product05 < 0);
-    const int32_t vremainder06 = (vq31product06 & vremainder_mask) - (int32_t)(vq31product06 < 0);
-    const int32_t vremainder07 = (vq31product07 & vremainder_mask) - (int32_t)(vq31product07 < 0);
-    const int32_t vremainder10 = (vq31product10 & vremainder_mask) - (int32_t)(vq31product10 < 0);
-    const int32_t vremainder11 = (vq31product11 & vremainder_mask) - (int32_t)(vq31product11 < 0);
-    const int32_t vremainder12 = (vq31product12 & vremainder_mask) - (int32_t)(vq31product12 < 0);
-    const int32_t vremainder13 = (vq31product13 & vremainder_mask) - (int32_t)(vq31product13 < 0);
-    const int32_t vremainder14 = (vq31product14 & vremainder_mask) - (int32_t)(vq31product14 < 0);
-    const int32_t vremainder15 = (vq31product15 & vremainder_mask) - (int32_t)(vq31product15 < 0);
-    const int32_t vremainder16 = (vq31product16 & vremainder_mask) - (int32_t)(vq31product16 < 0);
-    const int32_t vremainder17 = (vq31product17 & vremainder_mask) - (int32_t)(vq31product17 < 0);
-    const int32_t vremainder20 = (vq31product20 & vremainder_mask) - (int32_t)(vq31product20 < 0);
-    const int32_t vremainder21 = (vq31product21 & vremainder_mask) - (int32_t)(vq31product21 < 0);
-    const int32_t vremainder22 = (vq31product22 & vremainder_mask) - (int32_t)(vq31product22 < 0);
-    const int32_t vremainder23 = (vq31product23 & vremainder_mask) - (int32_t)(vq31product23 < 0);
-    const int32_t vremainder24 = (vq31product24 & vremainder_mask) - (int32_t)(vq31product24 < 0);
-    const int32_t vremainder25 = (vq31product25 & vremainder_mask) - (int32_t)(vq31product25 < 0);
-    const int32_t vremainder26 = (vq31product26 & vremainder_mask) - (int32_t)(vq31product26 < 0);
-    const int32_t vremainder27 = (vq31product27 & vremainder_mask) - (int32_t)(vq31product27 < 0);
-    const int32_t vremainder30 = (vq31product30 & vremainder_mask) - (int32_t)(vq31product30 < 0);
-    const int32_t vremainder31 = (vq31product31 & vremainder_mask) - (int32_t)(vq31product31 < 0);
-    const int32_t vremainder32 = (vq31product32 & vremainder_mask) - (int32_t)(vq31product32 < 0);
-    const int32_t vremainder33 = (vq31product33 & vremainder_mask) - (int32_t)(vq31product33 < 0);
-    const int32_t vremainder34 = (vq31product34 & vremainder_mask) - (int32_t)(vq31product34 < 0);
-    const int32_t vremainder35 = (vq31product35 & vremainder_mask) - (int32_t)(vq31product35 < 0);
-    const int32_t vremainder36 = (vq31product36 & vremainder_mask) - (int32_t)(vq31product36 < 0);
-    const int32_t vremainder37 = (vq31product37 & vremainder_mask) - (int32_t)(vq31product37 < 0);
-    const int32_t vremainder40 = (vq31product40 & vremainder_mask) - (int32_t)(vq31product40 < 0);
-    const int32_t vremainder41 = (vq31product41 & vremainder_mask) - (int32_t)(vq31product41 < 0);
-    const int32_t vremainder42 = (vq31product42 & vremainder_mask) - (int32_t)(vq31product42 < 0);
-    const int32_t vremainder43 = (vq31product43 & vremainder_mask) - (int32_t)(vq31product43 < 0);
-    const int32_t vremainder44 = (vq31product44 & vremainder_mask) - (int32_t)(vq31product44 < 0);
-    const int32_t vremainder45 = (vq31product45 & vremainder_mask) - (int32_t)(vq31product45 < 0);
-    const int32_t vremainder46 = (vq31product46 & vremainder_mask) - (int32_t)(vq31product46 < 0);
-    const int32_t vremainder47 = (vq31product47 & vremainder_mask) - (int32_t)(vq31product47 < 0);
-    const int32_t vremainder50 = (vq31product50 & vremainder_mask) - (int32_t)(vq31product50 < 0);
-    const int32_t vremainder51 = (vq31product51 & vremainder_mask) - (int32_t)(vq31product51 < 0);
-    const int32_t vremainder52 = (vq31product52 & vremainder_mask) - (int32_t)(vq31product52 < 0);
-    const int32_t vremainder53 = (vq31product53 & vremainder_mask) - (int32_t)(vq31product53 < 0);
-    const int32_t vremainder54 = (vq31product54 & vremainder_mask) - (int32_t)(vq31product54 < 0);
-    const int32_t vremainder55 = (vq31product55 & vremainder_mask) - (int32_t)(vq31product55 < 0);
-    const int32_t vremainder56 = (vq31product56 & vremainder_mask) - (int32_t)(vq31product56 < 0);
-    const int32_t vremainder57 = (vq31product57 & vremainder_mask) - (int32_t)(vq31product57 < 0);
-    const int32_t vremainder60 = (vq31product60 & vremainder_mask) - (int32_t)(vq31product60 < 0);
-    const int32_t vremainder61 = (vq31product61 & vremainder_mask) - (int32_t)(vq31product61 < 0);
-    const int32_t vremainder62 = (vq31product62 & vremainder_mask) - (int32_t)(vq31product62 < 0);
-    const int32_t vremainder63 = (vq31product63 & vremainder_mask) - (int32_t)(vq31product63 < 0);
-    const int32_t vremainder64 = (vq31product64 & vremainder_mask) - (int32_t)(vq31product64 < 0);
-    const int32_t vremainder65 = (vq31product65 & vremainder_mask) - (int32_t)(vq31product65 < 0);
-    const int32_t vremainder66 = (vq31product66 & vremainder_mask) - (int32_t)(vq31product66 < 0);
-    const int32_t vremainder67 = (vq31product67 & vremainder_mask) - (int32_t)(vq31product67 < 0);
-    const int32_t vremainder70 = (vq31product70 & vremainder_mask) - (int32_t)(vq31product70 < 0);
-    const int32_t vremainder71 = (vq31product71 & vremainder_mask) - (int32_t)(vq31product71 < 0);
-    const int32_t vremainder72 = (vq31product72 & vremainder_mask) - (int32_t)(vq31product72 < 0);
-    const int32_t vremainder73 = (vq31product73 & vremainder_mask) - (int32_t)(vq31product73 < 0);
-    const int32_t vremainder74 = (vq31product74 & vremainder_mask) - (int32_t)(vq31product74 < 0);
-    const int32_t vremainder75 = (vq31product75 & vremainder_mask) - (int32_t)(vq31product75 < 0);
-    const int32_t vremainder76 = (vq31product76 & vremainder_mask) - (int32_t)(vq31product76 < 0);
-    const int32_t vremainder77 = (vq31product77 & vremainder_mask) - (int32_t)(vq31product77 < 0);
-
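-    // asr_s32() rounds toward negative infinity, so comparing the remainder
-    // (adjusted by the sign term above) against `vremainder_threshold` adds
-    // back 1 where needed to obtain round-to-nearest behaviour for the final
-    // shift by `vshift`.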
-    int32_t vout00 = asr_s32(vq31product00, vshift) + (int32_t)(vremainder00 > vremainder_threshold);
-    int32_t vout01 = asr_s32(vq31product01, vshift) + (int32_t)(vremainder01 > vremainder_threshold);
-    int32_t vout02 = asr_s32(vq31product02, vshift) + (int32_t)(vremainder02 > vremainder_threshold);
-    int32_t vout03 = asr_s32(vq31product03, vshift) + (int32_t)(vremainder03 > vremainder_threshold);
-    int32_t vout04 = asr_s32(vq31product04, vshift) + (int32_t)(vremainder04 > vremainder_threshold);
-    int32_t vout05 = asr_s32(vq31product05, vshift) + (int32_t)(vremainder05 > vremainder_threshold);
-    int32_t vout06 = asr_s32(vq31product06, vshift) + (int32_t)(vremainder06 > vremainder_threshold);
-    int32_t vout07 = asr_s32(vq31product07, vshift) + (int32_t)(vremainder07 > vremainder_threshold);
-    int32_t vout10 = asr_s32(vq31product10, vshift) + (int32_t)(vremainder10 > vremainder_threshold);
-    int32_t vout11 = asr_s32(vq31product11, vshift) + (int32_t)(vremainder11 > vremainder_threshold);
-    int32_t vout12 = asr_s32(vq31product12, vshift) + (int32_t)(vremainder12 > vremainder_threshold);
-    int32_t vout13 = asr_s32(vq31product13, vshift) + (int32_t)(vremainder13 > vremainder_threshold);
-    int32_t vout14 = asr_s32(vq31product14, vshift) + (int32_t)(vremainder14 > vremainder_threshold);
-    int32_t vout15 = asr_s32(vq31product15, vshift) + (int32_t)(vremainder15 > vremainder_threshold);
-    int32_t vout16 = asr_s32(vq31product16, vshift) + (int32_t)(vremainder16 > vremainder_threshold);
-    int32_t vout17 = asr_s32(vq31product17, vshift) + (int32_t)(vremainder17 > vremainder_threshold);
-    int32_t vout20 = asr_s32(vq31product20, vshift) + (int32_t)(vremainder20 > vremainder_threshold);
-    int32_t vout21 = asr_s32(vq31product21, vshift) + (int32_t)(vremainder21 > vremainder_threshold);
-    int32_t vout22 = asr_s32(vq31product22, vshift) + (int32_t)(vremainder22 > vremainder_threshold);
-    int32_t vout23 = asr_s32(vq31product23, vshift) + (int32_t)(vremainder23 > vremainder_threshold);
-    int32_t vout24 = asr_s32(vq31product24, vshift) + (int32_t)(vremainder24 > vremainder_threshold);
-    int32_t vout25 = asr_s32(vq31product25, vshift) + (int32_t)(vremainder25 > vremainder_threshold);
-    int32_t vout26 = asr_s32(vq31product26, vshift) + (int32_t)(vremainder26 > vremainder_threshold);
-    int32_t vout27 = asr_s32(vq31product27, vshift) + (int32_t)(vremainder27 > vremainder_threshold);
-    int32_t vout30 = asr_s32(vq31product30, vshift) + (int32_t)(vremainder30 > vremainder_threshold);
-    int32_t vout31 = asr_s32(vq31product31, vshift) + (int32_t)(vremainder31 > vremainder_threshold);
-    int32_t vout32 = asr_s32(vq31product32, vshift) + (int32_t)(vremainder32 > vremainder_threshold);
-    int32_t vout33 = asr_s32(vq31product33, vshift) + (int32_t)(vremainder33 > vremainder_threshold);
-    int32_t vout34 = asr_s32(vq31product34, vshift) + (int32_t)(vremainder34 > vremainder_threshold);
-    int32_t vout35 = asr_s32(vq31product35, vshift) + (int32_t)(vremainder35 > vremainder_threshold);
-    int32_t vout36 = asr_s32(vq31product36, vshift) + (int32_t)(vremainder36 > vremainder_threshold);
-    int32_t vout37 = asr_s32(vq31product37, vshift) + (int32_t)(vremainder37 > vremainder_threshold);
-    int32_t vout40 = asr_s32(vq31product40, vshift) + (int32_t)(vremainder40 > vremainder_threshold);
-    int32_t vout41 = asr_s32(vq31product41, vshift) + (int32_t)(vremainder41 > vremainder_threshold);
-    int32_t vout42 = asr_s32(vq31product42, vshift) + (int32_t)(vremainder42 > vremainder_threshold);
-    int32_t vout43 = asr_s32(vq31product43, vshift) + (int32_t)(vremainder43 > vremainder_threshold);
-    int32_t vout44 = asr_s32(vq31product44, vshift) + (int32_t)(vremainder44 > vremainder_threshold);
-    int32_t vout45 = asr_s32(vq31product45, vshift) + (int32_t)(vremainder45 > vremainder_threshold);
-    int32_t vout46 = asr_s32(vq31product46, vshift) + (int32_t)(vremainder46 > vremainder_threshold);
-    int32_t vout47 = asr_s32(vq31product47, vshift) + (int32_t)(vremainder47 > vremainder_threshold);
-    int32_t vout50 = asr_s32(vq31product50, vshift) + (int32_t)(vremainder50 > vremainder_threshold);
-    int32_t vout51 = asr_s32(vq31product51, vshift) + (int32_t)(vremainder51 > vremainder_threshold);
-    int32_t vout52 = asr_s32(vq31product52, vshift) + (int32_t)(vremainder52 > vremainder_threshold);
-    int32_t vout53 = asr_s32(vq31product53, vshift) + (int32_t)(vremainder53 > vremainder_threshold);
-    int32_t vout54 = asr_s32(vq31product54, vshift) + (int32_t)(vremainder54 > vremainder_threshold);
-    int32_t vout55 = asr_s32(vq31product55, vshift) + (int32_t)(vremainder55 > vremainder_threshold);
-    int32_t vout56 = asr_s32(vq31product56, vshift) + (int32_t)(vremainder56 > vremainder_threshold);
-    int32_t vout57 = asr_s32(vq31product57, vshift) + (int32_t)(vremainder57 > vremainder_threshold);
-    int32_t vout60 = asr_s32(vq31product60, vshift) + (int32_t)(vremainder60 > vremainder_threshold);
-    int32_t vout61 = asr_s32(vq31product61, vshift) + (int32_t)(vremainder61 > vremainder_threshold);
-    int32_t vout62 = asr_s32(vq31product62, vshift) + (int32_t)(vremainder62 > vremainder_threshold);
-    int32_t vout63 = asr_s32(vq31product63, vshift) + (int32_t)(vremainder63 > vremainder_threshold);
-    int32_t vout64 = asr_s32(vq31product64, vshift) + (int32_t)(vremainder64 > vremainder_threshold);
-    int32_t vout65 = asr_s32(vq31product65, vshift) + (int32_t)(vremainder65 > vremainder_threshold);
-    int32_t vout66 = asr_s32(vq31product66, vshift) + (int32_t)(vremainder66 > vremainder_threshold);
-    int32_t vout67 = asr_s32(vq31product67, vshift) + (int32_t)(vremainder67 > vremainder_threshold);
-    int32_t vout70 = asr_s32(vq31product70, vshift) + (int32_t)(vremainder70 > vremainder_threshold);
-    int32_t vout71 = asr_s32(vq31product71, vshift) + (int32_t)(vremainder71 > vremainder_threshold);
-    int32_t vout72 = asr_s32(vq31product72, vshift) + (int32_t)(vremainder72 > vremainder_threshold);
-    int32_t vout73 = asr_s32(vq31product73, vshift) + (int32_t)(vremainder73 > vremainder_threshold);
-    int32_t vout74 = asr_s32(vq31product74, vshift) + (int32_t)(vremainder74 > vremainder_threshold);
-    int32_t vout75 = asr_s32(vq31product75, vshift) + (int32_t)(vremainder75 > vremainder_threshold);
-    int32_t vout76 = asr_s32(vq31product76, vshift) + (int32_t)(vremainder76 > vremainder_threshold);
-    int32_t vout77 = asr_s32(vq31product77, vshift) + (int32_t)(vremainder77 > vremainder_threshold);
-
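-    // Clamp before the zero-point addition: the bounds here already have the
-    // zero point subtracted (`output_min_less_zero_point` /
-    // `output_max_less_zero_point`), so after the addition below the result
-    // lands in [output_min, output_max].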
-    vout00 = vout00 < voutput_min ? voutput_min : vout00;
-    vout01 = vout01 < voutput_min ? voutput_min : vout01;
-    vout02 = vout02 < voutput_min ? voutput_min : vout02;
-    vout03 = vout03 < voutput_min ? voutput_min : vout03;
-    vout04 = vout04 < voutput_min ? voutput_min : vout04;
-    vout05 = vout05 < voutput_min ? voutput_min : vout05;
-    vout06 = vout06 < voutput_min ? voutput_min : vout06;
-    vout07 = vout07 < voutput_min ? voutput_min : vout07;
-    vout10 = vout10 < voutput_min ? voutput_min : vout10;
-    vout11 = vout11 < voutput_min ? voutput_min : vout11;
-    vout12 = vout12 < voutput_min ? voutput_min : vout12;
-    vout13 = vout13 < voutput_min ? voutput_min : vout13;
-    vout14 = vout14 < voutput_min ? voutput_min : vout14;
-    vout15 = vout15 < voutput_min ? voutput_min : vout15;
-    vout16 = vout16 < voutput_min ? voutput_min : vout16;
-    vout17 = vout17 < voutput_min ? voutput_min : vout17;
-    vout20 = vout20 < voutput_min ? voutput_min : vout20;
-    vout21 = vout21 < voutput_min ? voutput_min : vout21;
-    vout22 = vout22 < voutput_min ? voutput_min : vout22;
-    vout23 = vout23 < voutput_min ? voutput_min : vout23;
-    vout24 = vout24 < voutput_min ? voutput_min : vout24;
-    vout25 = vout25 < voutput_min ? voutput_min : vout25;
-    vout26 = vout26 < voutput_min ? voutput_min : vout26;
-    vout27 = vout27 < voutput_min ? voutput_min : vout27;
-    vout30 = vout30 < voutput_min ? voutput_min : vout30;
-    vout31 = vout31 < voutput_min ? voutput_min : vout31;
-    vout32 = vout32 < voutput_min ? voutput_min : vout32;
-    vout33 = vout33 < voutput_min ? voutput_min : vout33;
-    vout34 = vout34 < voutput_min ? voutput_min : vout34;
-    vout35 = vout35 < voutput_min ? voutput_min : vout35;
-    vout36 = vout36 < voutput_min ? voutput_min : vout36;
-    vout37 = vout37 < voutput_min ? voutput_min : vout37;
-    vout40 = vout40 < voutput_min ? voutput_min : vout40;
-    vout41 = vout41 < voutput_min ? voutput_min : vout41;
-    vout42 = vout42 < voutput_min ? voutput_min : vout42;
-    vout43 = vout43 < voutput_min ? voutput_min : vout43;
-    vout44 = vout44 < voutput_min ? voutput_min : vout44;
-    vout45 = vout45 < voutput_min ? voutput_min : vout45;
-    vout46 = vout46 < voutput_min ? voutput_min : vout46;
-    vout47 = vout47 < voutput_min ? voutput_min : vout47;
-    vout50 = vout50 < voutput_min ? voutput_min : vout50;
-    vout51 = vout51 < voutput_min ? voutput_min : vout51;
-    vout52 = vout52 < voutput_min ? voutput_min : vout52;
-    vout53 = vout53 < voutput_min ? voutput_min : vout53;
-    vout54 = vout54 < voutput_min ? voutput_min : vout54;
-    vout55 = vout55 < voutput_min ? voutput_min : vout55;
-    vout56 = vout56 < voutput_min ? voutput_min : vout56;
-    vout57 = vout57 < voutput_min ? voutput_min : vout57;
-    vout60 = vout60 < voutput_min ? voutput_min : vout60;
-    vout61 = vout61 < voutput_min ? voutput_min : vout61;
-    vout62 = vout62 < voutput_min ? voutput_min : vout62;
-    vout63 = vout63 < voutput_min ? voutput_min : vout63;
-    vout64 = vout64 < voutput_min ? voutput_min : vout64;
-    vout65 = vout65 < voutput_min ? voutput_min : vout65;
-    vout66 = vout66 < voutput_min ? voutput_min : vout66;
-    vout67 = vout67 < voutput_min ? voutput_min : vout67;
-    vout70 = vout70 < voutput_min ? voutput_min : vout70;
-    vout71 = vout71 < voutput_min ? voutput_min : vout71;
-    vout72 = vout72 < voutput_min ? voutput_min : vout72;
-    vout73 = vout73 < voutput_min ? voutput_min : vout73;
-    vout74 = vout74 < voutput_min ? voutput_min : vout74;
-    vout75 = vout75 < voutput_min ? voutput_min : vout75;
-    vout76 = vout76 < voutput_min ? voutput_min : vout76;
-    vout77 = vout77 < voutput_min ? voutput_min : vout77;
-
-    vout00 = vout00 > voutput_max ? voutput_max : vout00;
-    vout01 = vout01 > voutput_max ? voutput_max : vout01;
-    vout02 = vout02 > voutput_max ? voutput_max : vout02;
-    vout03 = vout03 > voutput_max ? voutput_max : vout03;
-    vout04 = vout04 > voutput_max ? voutput_max : vout04;
-    vout05 = vout05 > voutput_max ? voutput_max : vout05;
-    vout06 = vout06 > voutput_max ? voutput_max : vout06;
-    vout07 = vout07 > voutput_max ? voutput_max : vout07;
-    vout10 = vout10 > voutput_max ? voutput_max : vout10;
-    vout11 = vout11 > voutput_max ? voutput_max : vout11;
-    vout12 = vout12 > voutput_max ? voutput_max : vout12;
-    vout13 = vout13 > voutput_max ? voutput_max : vout13;
-    vout14 = vout14 > voutput_max ? voutput_max : vout14;
-    vout15 = vout15 > voutput_max ? voutput_max : vout15;
-    vout16 = vout16 > voutput_max ? voutput_max : vout16;
-    vout17 = vout17 > voutput_max ? voutput_max : vout17;
-    vout20 = vout20 > voutput_max ? voutput_max : vout20;
-    vout21 = vout21 > voutput_max ? voutput_max : vout21;
-    vout22 = vout22 > voutput_max ? voutput_max : vout22;
-    vout23 = vout23 > voutput_max ? voutput_max : vout23;
-    vout24 = vout24 > voutput_max ? voutput_max : vout24;
-    vout25 = vout25 > voutput_max ? voutput_max : vout25;
-    vout26 = vout26 > voutput_max ? voutput_max : vout26;
-    vout27 = vout27 > voutput_max ? voutput_max : vout27;
-    vout30 = vout30 > voutput_max ? voutput_max : vout30;
-    vout31 = vout31 > voutput_max ? voutput_max : vout31;
-    vout32 = vout32 > voutput_max ? voutput_max : vout32;
-    vout33 = vout33 > voutput_max ? voutput_max : vout33;
-    vout34 = vout34 > voutput_max ? voutput_max : vout34;
-    vout35 = vout35 > voutput_max ? voutput_max : vout35;
-    vout36 = vout36 > voutput_max ? voutput_max : vout36;
-    vout37 = vout37 > voutput_max ? voutput_max : vout37;
-    vout40 = vout40 > voutput_max ? voutput_max : vout40;
-    vout41 = vout41 > voutput_max ? voutput_max : vout41;
-    vout42 = vout42 > voutput_max ? voutput_max : vout42;
-    vout43 = vout43 > voutput_max ? voutput_max : vout43;
-    vout44 = vout44 > voutput_max ? voutput_max : vout44;
-    vout45 = vout45 > voutput_max ? voutput_max : vout45;
-    vout46 = vout46 > voutput_max ? voutput_max : vout46;
-    vout47 = vout47 > voutput_max ? voutput_max : vout47;
-    vout50 = vout50 > voutput_max ? voutput_max : vout50;
-    vout51 = vout51 > voutput_max ? voutput_max : vout51;
-    vout52 = vout52 > voutput_max ? voutput_max : vout52;
-    vout53 = vout53 > voutput_max ? voutput_max : vout53;
-    vout54 = vout54 > voutput_max ? voutput_max : vout54;
-    vout55 = vout55 > voutput_max ? voutput_max : vout55;
-    vout56 = vout56 > voutput_max ? voutput_max : vout56;
-    vout57 = vout57 > voutput_max ? voutput_max : vout57;
-    vout60 = vout60 > voutput_max ? voutput_max : vout60;
-    vout61 = vout61 > voutput_max ? voutput_max : vout61;
-    vout62 = vout62 > voutput_max ? voutput_max : vout62;
-    vout63 = vout63 > voutput_max ? voutput_max : vout63;
-    vout64 = vout64 > voutput_max ? voutput_max : vout64;
-    vout65 = vout65 > voutput_max ? voutput_max : vout65;
-    vout66 = vout66 > voutput_max ? voutput_max : vout66;
-    vout67 = vout67 > voutput_max ? voutput_max : vout67;
-    vout70 = vout70 > voutput_max ? voutput_max : vout70;
-    vout71 = vout71 > voutput_max ? voutput_max : vout71;
-    vout72 = vout72 > voutput_max ? voutput_max : vout72;
-    vout73 = vout73 > voutput_max ? voutput_max : vout73;
-    vout74 = vout74 > voutput_max ? voutput_max : vout74;
-    vout75 = vout75 > voutput_max ? voutput_max : vout75;
-    vout76 = vout76 > voutput_max ? voutput_max : vout76;
-    vout77 = vout77 > voutput_max ? voutput_max : vout77;
-
-    vout00 += voutput_zero_point;
-    vout01 += voutput_zero_point;
-    vout02 += voutput_zero_point;
-    vout03 += voutput_zero_point;
-    vout04 += voutput_zero_point;
-    vout05 += voutput_zero_point;
-    vout06 += voutput_zero_point;
-    vout07 += voutput_zero_point;
-    vout10 += voutput_zero_point;
-    vout11 += voutput_zero_point;
-    vout12 += voutput_zero_point;
-    vout13 += voutput_zero_point;
-    vout14 += voutput_zero_point;
-    vout15 += voutput_zero_point;
-    vout16 += voutput_zero_point;
-    vout17 += voutput_zero_point;
-    vout20 += voutput_zero_point;
-    vout21 += voutput_zero_point;
-    vout22 += voutput_zero_point;
-    vout23 += voutput_zero_point;
-    vout24 += voutput_zero_point;
-    vout25 += voutput_zero_point;
-    vout26 += voutput_zero_point;
-    vout27 += voutput_zero_point;
-    vout30 += voutput_zero_point;
-    vout31 += voutput_zero_point;
-    vout32 += voutput_zero_point;
-    vout33 += voutput_zero_point;
-    vout34 += voutput_zero_point;
-    vout35 += voutput_zero_point;
-    vout36 += voutput_zero_point;
-    vout37 += voutput_zero_point;
-    vout40 += voutput_zero_point;
-    vout41 += voutput_zero_point;
-    vout42 += voutput_zero_point;
-    vout43 += voutput_zero_point;
-    vout44 += voutput_zero_point;
-    vout45 += voutput_zero_point;
-    vout46 += voutput_zero_point;
-    vout47 += voutput_zero_point;
-    vout50 += voutput_zero_point;
-    vout51 += voutput_zero_point;
-    vout52 += voutput_zero_point;
-    vout53 += voutput_zero_point;
-    vout54 += voutput_zero_point;
-    vout55 += voutput_zero_point;
-    vout56 += voutput_zero_point;
-    vout57 += voutput_zero_point;
-    vout60 += voutput_zero_point;
-    vout61 += voutput_zero_point;
-    vout62 += voutput_zero_point;
-    vout63 += voutput_zero_point;
-    vout64 += voutput_zero_point;
-    vout65 += voutput_zero_point;
-    vout66 += voutput_zero_point;
-    vout67 += voutput_zero_point;
-    vout70 += voutput_zero_point;
-    vout71 += voutput_zero_point;
-    vout72 += voutput_zero_point;
-    vout73 += voutput_zero_point;
-    vout74 += voutput_zero_point;
-    vout75 += voutput_zero_point;
-    vout76 += voutput_zero_point;
-    vout77 += voutput_zero_point;
-
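-    // Store the 8x8 output tile. The common case writes all 8 columns of each
-    // row and advances every row pointer by `cn_stride` to the next column
-    // group; the tail case writes only the remaining `nc` columns and ends
-    // the outer loop.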
-    if XNN_LIKELY (nc >= 8) {
-      // Main case where the 8 columns fit in the destination.
-      c0[0] = vout00;
-      c0[1] = vout01;
-      c0[2] = vout02;
-      c0[3] = vout03;
-      c0[4] = vout04;
-      c0[5] = vout05;
-      c0[6] = vout06;
-      c0[7] = vout07;
-      c1[0] = vout10;
-      c1[1] = vout11;
-      c1[2] = vout12;
-      c1[3] = vout13;
-      c1[4] = vout14;
-      c1[5] = vout15;
-      c1[6] = vout16;
-      c1[7] = vout17;
-      c2[0] = vout20;
-      c2[1] = vout21;
-      c2[2] = vout22;
-      c2[3] = vout23;
-      c2[4] = vout24;
-      c2[5] = vout25;
-      c2[6] = vout26;
-      c2[7] = vout27;
-      c3[0] = vout30;
-      c3[1] = vout31;
-      c3[2] = vout32;
-      c3[3] = vout33;
-      c3[4] = vout34;
-      c3[5] = vout35;
-      c3[6] = vout36;
-      c3[7] = vout37;
-      c4[0] = vout40;
-      c4[1] = vout41;
-      c4[2] = vout42;
-      c4[3] = vout43;
-      c4[4] = vout44;
-      c4[5] = vout45;
-      c4[6] = vout46;
-      c4[7] = vout47;
-      c5[0] = vout50;
-      c5[1] = vout51;
-      c5[2] = vout52;
-      c5[3] = vout53;
-      c5[4] = vout54;
-      c5[5] = vout55;
-      c5[6] = vout56;
-      c5[7] = vout57;
-      c6[0] = vout60;
-      c6[1] = vout61;
-      c6[2] = vout62;
-      c6[3] = vout63;
-      c6[4] = vout64;
-      c6[5] = vout65;
-      c6[6] = vout66;
-      c6[7] = vout67;
-      c7[0] = vout70;
-      c7[1] = vout71;
-      c7[2] = vout72;
-      c7[3] = vout73;
-      c7[4] = vout74;
-      c7[5] = vout75;
-      c7[6] = vout76;
-      c7[7] = vout77;
-
-      // Advance to the next 8 columns.
-      c0 = (uint8_t*)((uintptr_t)c0 + cn_stride);
-      c1 = (uint8_t*)((uintptr_t)c1 + cn_stride);
-      c2 = (uint8_t*)((uintptr_t)c2 + cn_stride);
-      c3 = (uint8_t*)((uintptr_t)c3 + cn_stride);
-      c4 = (uint8_t*)((uintptr_t)c4 + cn_stride);
-      c5 = (uint8_t*)((uintptr_t)c5 + cn_stride);
-      c6 = (uint8_t*)((uintptr_t)c6 + cn_stride);
-      c7 = (uint8_t*)((uintptr_t)c7 + cn_stride);
-
-      nc -= 8;
-    } else {
-      // Final case where not all of the 8 columns fit in the destination.
-      if (nc > 0) {
-        c0[0] = vout00;
-        c1[0] = vout10;
-        c2[0] = vout20;
-        c3[0] = vout30;
-        c4[0] = vout40;
-        c5[0] = vout50;
-        c6[0] = vout60;
-        c7[0] = vout70;
-      }
-      if (nc > 1) {
-        c0[1] = vout01;
-        c1[1] = vout11;
-        c2[1] = vout21;
-        c3[1] = vout31;
-        c4[1] = vout41;
-        c5[1] = vout51;
-        c6[1] = vout61;
-        c7[1] = vout71;
-      }
-      if (nc > 2) {
-        c0[2] = vout02;
-        c1[2] = vout12;
-        c2[2] = vout22;
-        c3[2] = vout32;
-        c4[2] = vout42;
-        c5[2] = vout52;
-        c6[2] = vout62;
-        c7[2] = vout72;
-      }
-      if (nc > 3) {
-        c0[3] = vout03;
-        c1[3] = vout13;
-        c2[3] = vout23;
-        c3[3] = vout33;
-        c4[3] = vout43;
-        c5[3] = vout53;
-        c6[3] = vout63;
-        c7[3] = vout73;
-      }
-      if (nc > 4) {
-        c0[4] = vout04;
-        c1[4] = vout14;
-        c2[4] = vout24;
-        c3[4] = vout34;
-        c4[4] = vout44;
-        c5[4] = vout54;
-        c6[4] = vout64;
-        c7[4] = vout74;
-      }
-      if (nc > 5) {
-        c0[5] = vout05;
-        c1[5] = vout15;
-        c2[5] = vout25;
-        c3[5] = vout35;
-        c4[5] = vout45;
-        c5[5] = vout55;
-        c6[5] = vout65;
-        c7[5] = vout75;
-      }
-      if (nc > 6) {
-        c0[6] = vout06;
-        c1[6] = vout16;
-        c2[6] = vout26;
-        c3[6] = vout36;
-        c4[6] = vout46;
-        c5[6] = vout56;
-        c6[6] = vout66;
-        c7[6] = vout76;
-      }
-      if (nc > 7) {
-        c0[7] = vout07;
-        c1[7] = vout17;
-        c2[7] = vout27;
-        c3[7] = vout37;
-        c4[7] = vout47;
-        c5[7] = vout57;
-        c6[7] = vout67;
-        c7[7] = vout77;
-      }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
diff --git a/src/qu8-igemm/4x4c2-minmax-sse2.c b/src/qu8-igemm/4x4c2-minmax-sse2.c
index ba7573c..cef6f81 100644
--- a/src/qu8-igemm/4x4c2-minmax-sse2.c
+++ b/src/qu8-igemm/4x4c2-minmax-sse2.c
@@ -11,6 +11,7 @@
 #include <immintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qu8_igemm_minmax_ukernel_4x4c2__sse2(
@@ -33,7 +34,12 @@
   assert(kc != 0);
   assert(ks != 0);
   assert(ks % (4 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   uint8_t* c0 = c;
   uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
@@ -163,17 +169,6 @@
             vacc1x0123 = _mm_add_epi32(vacc1x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
             vacc2x0123 = _mm_add_epi32(vacc2x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
             vacc3x0123 = _mm_add_epi32(vacc3x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-            if (k > 6 * sizeof(uint8_t)) {
-              const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-              const __m128i vxb3 = _mm_sub_epi16(_mm_unpacklo_epi8(vb3, vzero), vb_zero_point);
-              w = (void*) ((uintptr_t) w + 8);
-
-              vacc0x0123 = _mm_add_epi32(vacc0x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-              vacc1x0123 = _mm_add_epi32(vacc1x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-              vacc2x0123 = _mm_add_epi32(vacc2x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-              vacc3x0123 = _mm_add_epi32(vacc3x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            }
           }
         }
       }
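
Note: the added `kc = round_up_po2(kc, 2);` rounds the reduction dimension up to a multiple of 2 before the packed weights are read, so the per-iteration remainder can only be 2, 4, or 6 bytes and the removed `k > 6 * sizeof(uint8_t)` branch can no longer be reached. A minimal sketch of the rounding helper, assuming the usual power-of-two idiom behind <xnnpack/math.h> (the name round_up_po2_sketch below is hypothetical):

    #include <assert.h>
    #include <stddef.h>

    // Rounds n up to a multiple of q; q is assumed to be a power of two.
    static inline size_t round_up_po2_sketch(size_t n, size_t q) {
      assert(q != 0 && (q & (q - 1)) == 0);
      return (n + q - 1) & ~(q - 1);
    }

    // round_up_po2_sketch(5, 2) == 6; round_up_po2_sketch(8, 2) == 8.
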
diff --git a/src/x32-depthtospace2d-chw2hwc/scalar.c b/src/x32-depthtospace2d-chw2hwc/scalar.c
index 80e453b..2d1fa54 100644
--- a/src/x32-depthtospace2d-chw2hwc/scalar.c
+++ b/src/x32-depthtospace2d-chw2hwc/scalar.c
@@ -15,72 +15,23 @@
     size_t block_size,
     const uint32_t*restrict input,
     uint32_t*restrict output,
-    size_t input_channel_stride,
-    size_t input_height_stride,
-    size_t output_height_stride,
-    size_t output_width_stride)
+    size_t output_channel_stride)
 {
   assert(output_channels != 0);
   assert(input_height != 0);
   assert(input_width != 0);
   assert(block_size != 0);
 
-  // output[(iy * block_size + by) * output_height_stride +
-  //        (ix * block_size + bx) * output_width_stride +
-  //        c * element_stride] =
-  //     input[
-  //         (c * block_size * block_size + by * block_size + bx) * input_channel_stride +
-  //         iy * input_height_stride +
-  //         ix * element_stride]
-
-  const size_t element_stride = sizeof(uint32_t);
-
-  const size_t iy_output_increment = block_size * output_height_stride;
-  const size_t by_output_increment = output_height_stride;
-  const size_t ix_output_increment = block_size * output_width_stride;
-  const size_t bx_output_increment = output_width_stride;
-  const size_t c_output_increment = element_stride;
-
-  const size_t c_input_increment = block_size * block_size * input_channel_stride;
-  const size_t by_input_increment = block_size * input_channel_stride;
-  const size_t bx_input_increment = input_channel_stride;
-  const size_t iy_input_increment = input_height_stride;
-  const size_t ix_input_increment = element_stride;
-
-  size_t iy = input_height;
-  uintptr_t i_iy = (uintptr_t) input;
-  uintptr_t o_iy = (uintptr_t) output;
-  do {
-    size_t by = block_size;
-    uintptr_t i_by = i_iy;
-    uintptr_t o_by = o_iy;
-    do {
-      size_t ix = input_width;
-      uintptr_t i_ix = i_by;
-      uintptr_t o_ix = o_by;
-      do {
-        size_t bx = block_size;
-        uintptr_t i_bx = i_ix;
-        uintptr_t o_bx = o_ix;
-        do {
-          size_t c = output_channels;
-          uintptr_t i_c = i_bx;
-          uintptr_t o_c = o_bx;
-          do {
-            *(uint32_t*) o_c = *(uint32_t*) i_c;
-            i_c += c_input_increment;
-            o_c += c_output_increment;
-          } while (--c != 0);
-          i_bx += bx_input_increment;
-          o_bx += bx_output_increment;
-        } while (--bx != 0);
-        i_ix += ix_input_increment;
-        o_ix += ix_output_increment;
-      } while (--ix != 0);
-      i_by += by_input_increment;
-      o_by += by_output_increment;
-    } while (--by != 0);
-    i_iy += iy_input_increment;
-    o_iy += iy_output_increment;
-  } while (--iy != 0);
+  for (size_t iy = 0; iy < input_height; iy++) {
+    for (size_t by = 0; by < block_size; by++) {
+      for (size_t ix = 0; ix < input_width; ix++) {
+        for (size_t bx = 0; bx < block_size; bx++) {
+          for (size_t oc = 0; oc < output_channels; oc++) {
+            output[(((iy * block_size + by) * input_width + ix) * block_size + bx) * output_channel_stride + oc] =
+              input[(((by * block_size + bx) * output_channels + oc) * input_height + iy) * input_width + ix];
+          }
+        }
+      }
+    }
+  }
 }
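
The rewritten scalar ukernel above replaces the pointer-bumping loops with direct index arithmetic. A small standalone sketch (hypothetical sizes, not part of the patch) exercising the same input/output index formulas for a 1-channel 2x2 input, block_size 2, and a contiguous output (output_channel_stride == output_channels):

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
      const size_t oc_count = 1, ih = 2, iw = 2, bs = 2, oc_stride = 1;
      uint32_t in[2 * 2 * 1 * 2 * 2];   // CHW input: bs*bs*oc_count planes of ih*iw
      uint32_t out[4 * 4 * 1];          // HWC output: (ih*bs) x (iw*bs) x oc_count
      for (size_t i = 0; i < sizeof(in) / sizeof(in[0]); i++) in[i] = (uint32_t) i;

      for (size_t iy = 0; iy < ih; iy++)
        for (size_t by = 0; by < bs; by++)
          for (size_t ix = 0; ix < iw; ix++)
            for (size_t bx = 0; bx < bs; bx++)
              for (size_t oc = 0; oc < oc_count; oc++)
                out[(((iy * bs + by) * iw + ix) * bs + bx) * oc_stride + oc] =
                  in[(((by * bs + bx) * oc_count + oc) * ih + iy) * iw + ix];

      // Prints the depth-to-space result row by row:
      //  0  4  1  5 /  8 12  9 13 /  2  6  3  7 / 10 14 11 15
      for (size_t y = 0; y < ih * bs; y++) {
        for (size_t x = 0; x < iw * bs; x++)
          printf("%2u ", (unsigned) out[(y * iw * bs + x) * oc_stride]);
        printf("\n");
      }
      return 0;
    }
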
diff --git a/src/xnnpack/compute.h b/src/xnnpack/compute.h
index 9f198e7..e76d102 100644
--- a/src/xnnpack/compute.h
+++ b/src/xnnpack/compute.h
@@ -476,10 +476,7 @@
   void* output;
   size_t input_batch_stride;
   size_t output_batch_stride;
-  size_t input_channel_stride;
-  size_t input_height_stride;
-  size_t output_height_stride;
-  size_t output_width_stride;
+  size_t output_channel_stride;
   xnn_depthtospace2d_chw2hwc_ukernel_function ukernel;
 };
 
diff --git a/src/xnnpack/depthtospace.h b/src/xnnpack/depthtospace.h
index 32d63b2..285fd3c 100644
--- a/src/xnnpack/depthtospace.h
+++ b/src/xnnpack/depthtospace.h
@@ -23,10 +23,7 @@
       size_t block_size,                                             \
       const uint32_t* input,                                         \
       uint32_t* output,                                              \
-      size_t input_channel_stride,                                   \
-      size_t input_height_stride,                                    \
-      size_t output_height_stride,                                   \
-      size_t output_width_stride);
+      size_t output_channel_stride);
 
 DECLARE_X32_DEPTHTOSPACE2D_CHW2HWC_UKERNEL_FUNCTION(xnn_x32_depthtospace2d_chw2hwc_ukernel__scalar)
 
diff --git a/src/xnnpack/gemm.h b/src/xnnpack/gemm.h
index 0eaa05a..74c1665 100644
--- a/src/xnnpack/gemm.h
+++ b/src/xnnpack/gemm.h
@@ -500,18 +500,11 @@
 DECLARE_QU8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_gemm_minmax_ukernel_4x8__neon)
 DECLARE_QU8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_gemm_minmax_ukernel_8x8__neon)
 
-DECLARE_QU8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_gemm_minmax_ukernel_4x8__aarch32_neon)
-
-DECLARE_QU8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_gemm_minmax_ukernel_8x8__aarch64_neon)
-
 DECLARE_QU8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_gemm_minmax_ukernel_2x4c8__sse2)
 DECLARE_QU8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_gemm_minmax_ukernel_4x4c2__sse2)
 
 DECLARE_QU8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_gemm_minmax_ukernel_2x2__scalar)
 
-DECLARE_QU8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_gemm_minmax_ukernel_8x8c4__scalar)
-DECLARE_QU8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_gemm_minmax_ukernel_12x4c4__scalar)
-
 #define DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(fn_name) \
   XNN_INTERNAL void fn_name(                              \
       size_t mr,                                          \
@@ -527,24 +520,93 @@
 
 DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x8__neon_mlal_lane)
 DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x8__neon_mlal_lane)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane)
 
 DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane)
 DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane)
+
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x8__neon_mull_addw_dup)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x8__neon_mull_addw_dup)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mull_addw_dup)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mull_addw_dup)
+
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup)
+
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mull_padal_dup)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mull_padal_dup)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mull_padal_dup)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mull_padal_dup)
+
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup)
+
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup)
+
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup)
+
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mull_padal)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mull_padal)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal)
+
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal)
+
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal)
+
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal)
+
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x8c16__neon_mlal_padal)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x8c16__neon_mlal_padal)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal)
+
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal)
 
 DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x8c4__neondot)
 DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_4x8c4__neondot)
 DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_6x8c4__neondot)
 DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_8x8c4__neondot)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_12x8c4__neondot)
 
 DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot)
 DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot)
 DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot)
 DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot)
 
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mull_padal)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal)
+
 DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot)
 DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld32)
 DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld64)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld32)
 DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64)
 DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55)
 
diff --git a/src/xnnpack/ibilinear.h b/src/xnnpack/ibilinear.h
index 5af6b8c..643df46 100644
--- a/src/xnnpack/ibilinear.h
+++ b/src/xnnpack/ibilinear.h
@@ -59,6 +59,12 @@
 DECLARE_F32_IBILINEAR_CHW_UKERNEL_FUNCTION(xnn_f32_ibilinear_chw_ukernel__wasmsimd_p4)
 DECLARE_F32_IBILINEAR_CHW_UKERNEL_FUNCTION(xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8)
 
+DECLARE_F32_IBILINEAR_CHW_UKERNEL_FUNCTION(xnn_f32_ibilinear_chw_ukernel__neon_p4)
+DECLARE_F32_IBILINEAR_CHW_UKERNEL_FUNCTION(xnn_f32_ibilinear_chw_ukernel__neon_p8)
+
+DECLARE_F32_IBILINEAR_CHW_UKERNEL_FUNCTION(xnn_f32_ibilinear_chw_ukernel__neonfma_p4)
+DECLARE_F32_IBILINEAR_CHW_UKERNEL_FUNCTION(xnn_f32_ibilinear_chw_ukernel__neonfma_p8)
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/src/xnnpack/igemm.h b/src/xnnpack/igemm.h
index 7e09667..3d3d33a 100644
--- a/src/xnnpack/igemm.h
+++ b/src/xnnpack/igemm.h
@@ -329,21 +329,90 @@
 
 DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x8__neon_mlal_lane)
 DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x8__neon_mlal_lane)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mlal_lane)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mlal_lane)
 
 DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x16__neon_mlal_lane)
 DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane)
+
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x8__neon_mull_addw_dup)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x8__neon_mull_addw_dup)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mull_addw_dup)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mull_addw_dup)
+
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup)
+
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mull_padal)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mull_padal)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal)
+
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal)
+
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mlal_padal)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal)
+
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal)
+
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x8c16__neon_mlal_padal)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x8c16__neon_mlal_padal)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal)
+
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal)
+
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup)
+
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup)
+
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup)
+
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup)
 
 DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x8c4__neondot)
 DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x8c4__neondot)
 DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_6x8c4__neondot)
 DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_8x8c4__neondot)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_12x8c4__neondot)
 
 DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot)
 DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot)
 DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot)
 DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot)
 
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal)
+
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55)
+
 DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse2_ld64)
 DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse2_ld64)
 
diff --git a/src/xnnpack/params.h b/src/xnnpack/params.h
index a9e3c66..18fdf53 100644
--- a/src/xnnpack/params.h
+++ b/src/xnnpack/params.h
@@ -935,10 +935,7 @@
     size_t block_size,
     const void* input,
     void* output,
-    size_t input_channel_stride,
-    size_t input_height_stride,
-    size_t output_height_stride,
-    size_t output_width_stride);
+    size_t output_channel_stride);
 
 typedef void (*xnn_x32_depthtospace2d_chw2hwc_ukernel_function)(
     size_t output_channels,
@@ -947,10 +944,7 @@
     size_t block_size,
     const uint32_t* input,
     uint32_t* output,
-    size_t input_channel_stride,
-    size_t input_height_stride,
-    size_t output_height_stride,
-    size_t output_width_stride);
+    size_t output_channel_stride);
 
 typedef void (*xnn_pad_ukernel_function)(
     size_t rows,
diff --git a/test/binary-elementwise-operator-tester.h b/test/binary-elementwise-operator-tester.h
index 7764fe6..968d807 100644
--- a/test/binary-elementwise-operator-tester.h
+++ b/test/binary-elementwise-operator-tester.h
@@ -70,7 +70,7 @@
   }
 
   inline BinaryElementwiseOperatorTester& input1_scale(float input1_scale) {
-    assert(isfinite(input1_scale));
+    assert(std::isfinite(input1_scale));
     this->input1_scale_ = input1_scale;
     return *this;
   }
@@ -112,7 +112,7 @@
   }
 
   inline BinaryElementwiseOperatorTester& input2_scale(float input2_scale) {
-    assert(isfinite(input2_scale));
+    assert(std::isfinite(input2_scale));
     this->input2_scale_ = input2_scale;
     return *this;
   }
@@ -131,7 +131,7 @@
   }
 
   inline BinaryElementwiseOperatorTester& output_scale(float output_scale) {
-    assert(isfinite(output_scale));
+    assert(std::isfinite(output_scale));
     this->output_scale_ = output_scale;
     return *this;
   }
diff --git a/test/depth-to-space-operator-tester.h b/test/depth-to-space-operator-tester.h
index c189a65..8e854bb 100644
--- a/test/depth-to-space-operator-tester.h
+++ b/test/depth-to-space-operator-tester.h
@@ -240,7 +240,7 @@
                 for (size_t oc = 0; oc < output_channels(); oc++) {
                   const size_t input_index =
                     i * input_channels_stride() * input_height() * input_width() +
-                    (((oc * block_size() + by) * block_size() + bx) * input_height() + iy) * input_width() + ix;
+                    (((by * block_size() + bx) * output_channels() + oc) * input_height() + iy) * input_width() + ix;
                   const size_t output_index =
                     ((i * output_height() + iy * block_size() + by) * output_width() + ix * block_size() + bx) *
                       output_channels_stride() + oc;
diff --git a/test/depthtospace-microkernel-tester.h b/test/depthtospace-microkernel-tester.h
index 3984c66..44042ba 100644
--- a/test/depthtospace-microkernel-tester.h
+++ b/test/depthtospace-microkernel-tester.h
@@ -74,69 +74,17 @@
     return this->block_size_;
   }
 
-  inline DepthToSpaceMicrokernelTester& element_size(size_t element_size) {
-    assert(element_size != 0);
-    this->element_size_ = element_size;
+  inline DepthToSpaceMicrokernelTester& output_channel_stride(size_t output_channel_stride) {
+    assert(output_channel_stride != 0);
+    this->output_channel_stride_ = output_channel_stride;
     return *this;
   }
 
-  inline size_t element_size() const {
-    return this->element_size_;
-  }
-
-  inline DepthToSpaceMicrokernelTester& input_channel_stride(size_t input_channel_stride) {
-    assert(input_channel_stride != 0);
-    this->input_channel_stride_ = input_channel_stride;
-    return *this;
-  }
-
-  inline size_t input_channel_stride() const {
-    if (this->input_channel_stride_ != 0) {
-      return this->input_channel_stride_;
+  inline size_t output_channel_stride() const {
+    if (this->output_channel_stride_ != 0) {
+      return this->output_channel_stride_;
     } else {
-      return this->input_height() * this->input_width() * this->element_size();
-    }
-  }
-
-  inline DepthToSpaceMicrokernelTester& input_height_stride(size_t input_height_stride) {
-    assert(input_height_stride != 0);
-    this->input_height_stride_ = input_height_stride;
-    return *this;
-  }
-
-  inline size_t input_height_stride() const {
-    if (this->input_height_stride_ != 0) {
-      return this->input_height_stride_;
-    } else {
-      return this->input_width() * this->element_size();
-    }
-  }
-
-  inline DepthToSpaceMicrokernelTester& output_height_stride(size_t output_height_stride) {
-    assert(output_height_stride != 0);
-    this->output_height_stride_ = output_height_stride;
-    return *this;
-  }
-
-  inline size_t output_height_stride() const {
-    if (this->output_height_stride_ != 0) {
-      return this->output_height_stride_;
-    } else {
-      return this->output_width() * this->output_channels() * this->element_size();
-    }
-  }
-
-  inline DepthToSpaceMicrokernelTester& output_width_stride(size_t output_width_stride) {
-    assert(output_width_stride != 0);
-    this->output_width_stride_ = output_width_stride;
-    return *this;
-  }
-
-  inline size_t output_width_stride() const {
-    if (this->output_width_stride_ != 0) {
-      return this->output_width_stride_;
-    } else {
-      return this->output_channels() * this->element_size();
+      return this->output_channels();
     }
   }
 
@@ -150,30 +98,14 @@
   }
 
   void Test(xnn_x32_depthtospace2d_chw2hwc_ukernel_function depthtospace2d) const {
-    ASSERT_EQ(element_size(), sizeof(uint32_t));
     ASSERT_GE(block_size(), 2);
-    ASSERT_GE(input_channel_stride(), input_height() * input_height_stride());
-    ASSERT_GE(input_height_stride(), input_width() * element_size());
-    ASSERT_GE(output_height_stride(), input_width() * block_size() * output_width_stride());
-    ASSERT_GE(output_width_stride(), output_channels() * element_size());
 
     std::random_device random_device;
     auto rng = std::mt19937(random_device());
     auto u32rng = std::bind(std::uniform_int_distribution<uint32_t>(), rng);
 
-    const size_t input_byte_size =
-        (input_channels() - 1) * input_channel_stride() +
-        (input_height() - 1) * input_height_stride() +
-        input_width() * element_size();
-    ASSERT_EQ(input_byte_size % element_size(), 0);
-    std::vector<uint32_t> input(input_byte_size / element_size());
-
-    const size_t output_byte_size =
-        (output_height() - 1) * output_height_stride() +
-        (output_width() - 1) * output_width_stride() +
-        output_channels() * element_size();
-    ASSERT_EQ(output_byte_size % element_size(), 0);
-    std::vector<uint32_t> output(output_byte_size / element_size());
+    std::vector<uint32_t> input(input_channels() * input_height() * input_width());
+    std::vector<uint32_t> output((output_height() * output_width() - 1) * output_channel_stride() + output_channels());
 
     for (size_t iteration = 0; iteration < iterations(); iteration++) {
       std::generate(input.begin(), input.end(), std::ref(u32rng));
@@ -186,36 +118,25 @@
         block_size(),
         input.data(),
         output.data(),
-        input_channel_stride(),
-        input_height_stride(),
-        output_height_stride(),
-        output_width_stride());
+        output_channel_stride());
 
       // Verify results.
-      for (size_t iy = 0; iy < input_height(); ++iy) {
-        for (size_t by = 0; by < block_size(); ++by) {
-          for (size_t ix = 0; ix < input_width(); ++ix) {
-            for (size_t bx = 0; bx < block_size(); ++bx) {
-              for (size_t c = 0; c < output_channels(); ++c) {
-                size_t input_offset =
-                    (c * block_size() * block_size() + by * block_size() + bx) * input_channel_stride() +
-                    iy * input_height_stride() +
-                    ix * element_size();
-                ASSERT_EQ(input_offset % element_size(), 0);
-                ASSERT_LT(input_offset / element_size(), input.size());
-
-                size_t output_offset =
-                    (iy * block_size() + by) * output_height_stride() +
-                    (ix * block_size() + bx) * output_width_stride() +
-                    c * element_size();
-                ASSERT_EQ(output_offset % element_size(), 0);
-                ASSERT_LT(output_offset / element_size(), output.size());
-
-                ASSERT_EQ(output[output_offset / element_size()],
-                          input[input_offset / element_size()])
-                    << "iy = " << iy << ", " << "by = " << by << ", "
-                    << "ix = " << ix << ", " << "bx = " << bx << ", "
-                    << "c = " << c;
+      for (size_t iy = 0; iy < input_height(); iy++) {
+        for (size_t by = 0; by < block_size(); by++) {
+          for (size_t ix = 0; ix < input_width(); ix++) {
+            for (size_t bx = 0; bx < block_size(); bx++) {
+              for (size_t oc = 0; oc < output_channels(); oc++) {
+                const size_t input_index =
+                  (((by * block_size() + bx) * output_channels() + oc) * input_height() + iy) * input_width() + ix;
+                const size_t output_index =
+                  ((iy * block_size() + by) * output_width() + ix * block_size() + bx) * output_channel_stride() + oc;
+                ASSERT_EQ(output[output_index], input[input_index])
+                  << "input x: " << ix << " / " << input_width()
+                  << ", input y: " << iy << " / " << input_height()
+                  << ", block x: " << bx << " / " << block_size()
+                  << ", block y: " << by << " / " << block_size()
+                  << ", output channel: " << oc << " / " << output_channels()
+                  << ", output stride: " << output_channel_stride();
               }
             }
           }
@@ -229,10 +150,6 @@
   size_t input_height_{1};
   size_t input_width_{1};
   size_t block_size_{2};
-  size_t element_size_{4};
-  size_t input_channel_stride_{0};
-  size_t input_height_stride_{0};
-  size_t output_height_stride_{0};
-  size_t output_width_stride_{0};
+  size_t output_channel_stride_{0};
   size_t iterations_{3};
 };
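
A note on the tester's new output-buffer sizing above: with P = output_height() * output_width() pixels, each of the first P - 1 pixels is separated by output_channel_stride() elements while the final pixel only needs output_channels() elements, hence the (P - 1) * stride + channels allocation. A minimal sketch of that calculation, with a hypothetical helper name:

    #include <stddef.h>

    static size_t output_buffer_elements(size_t output_height, size_t output_width,
                                         size_t output_channels, size_t output_channel_stride) {
      const size_t pixels = output_height * output_width;
      return (pixels - 1) * output_channel_stride + output_channels;
    }
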
diff --git a/test/f32-ibilinear-chw.cc b/test/f32-ibilinear-chw.cc
index e4939e2..e6e6d8a 100644
--- a/test/f32-ibilinear-chw.cc
+++ b/test/f32-ibilinear-chw.cc
@@ -396,3 +396,359 @@
     }
   }
 #endif  // XNN_ARCH_WASMSIMD
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(F32_IBILINEAR_CHW__NEON_P4, pixels_eq_4) {
+    TEST_REQUIRES_ARM_NEON;
+    IBilinearMicrokernelTester()
+      .pixels(4)
+      .channels(1)
+      .TestCHW(xnn_f32_ibilinear_chw_ukernel__neon_p4);
+  }
+
+  TEST(F32_IBILINEAR_CHW__NEON_P4, pixels_div_4) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t pixels = 8; pixels < 40; pixels += 4) {
+      IBilinearMicrokernelTester()
+        .pixels(pixels)
+        .channels(1)
+        .TestCHW(xnn_f32_ibilinear_chw_ukernel__neon_p4);
+    }
+  }
+
+  TEST(F32_IBILINEAR_CHW__NEON_P4, pixels_lt_4) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t pixels = 1; pixels < 4; pixels++) {
+      IBilinearMicrokernelTester()
+        .pixels(pixels)
+        .channels(1)
+        .TestCHW(xnn_f32_ibilinear_chw_ukernel__neon_p4);
+    }
+  }
+
+  TEST(F32_IBILINEAR_CHW__NEON_P4, pixels_gt_4) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t pixels = 5; pixels < 8; pixels++) {
+      IBilinearMicrokernelTester()
+        .pixels(pixels)
+        .channels(1)
+        .TestCHW(xnn_f32_ibilinear_chw_ukernel__neon_p4);
+    }
+  }
+
+  TEST(F32_IBILINEAR_CHW__NEON_P4, channels_eq_1) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t pixels = 1; pixels <= 20; pixels += 3) {
+      IBilinearMicrokernelTester()
+        .pixels(pixels)
+        .channels(1)
+        .TestCHW(xnn_f32_ibilinear_chw_ukernel__neon_p4);
+    }
+  }
+
+  TEST(F32_IBILINEAR_CHW__NEON_P4, channels_gt_1) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t channels = 2; channels < 3; channels++) {
+      for (size_t pixels = 1; pixels <= 20; pixels += 3) {
+        IBilinearMicrokernelTester()
+          .pixels(pixels)
+          .channels(channels)
+          .TestCHW(xnn_f32_ibilinear_chw_ukernel__neon_p4);
+      }
+    }
+  }
+
+  TEST(F32_IBILINEAR_CHW__NEON_P4, input_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t pixels = 1; pixels < 20; pixels += 3) {
+      for (size_t channels = 1; channels <= 5; channels += 1) {
+        IBilinearMicrokernelTester()
+          .pixels(pixels)
+          .channels(channels)
+          .input_offset(7)
+          .TestCHW(xnn_f32_ibilinear_chw_ukernel__neon_p4);
+      }
+    }
+  }
+
+  TEST(F32_IBILINEAR_CHW__NEON_P4, input_stride) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t pixels = 1; pixels < 20; pixels += 3) {
+      for (size_t channels = 1; channels <= 5; channels += 1) {
+        IBilinearMicrokernelTester()
+          .pixels(pixels)
+          .channels(channels)
+          .input_stride(83)
+          .TestCHW(xnn_f32_ibilinear_chw_ukernel__neon_p4);
+      }
+    }
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(F32_IBILINEAR_CHW__NEON_P8, pixels_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    IBilinearMicrokernelTester()
+      .pixels(8)
+      .channels(1)
+      .TestCHW(xnn_f32_ibilinear_chw_ukernel__neon_p8);
+  }
+
+  TEST(F32_IBILINEAR_CHW__NEON_P8, pixels_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t pixels = 16; pixels < 80; pixels += 8) {
+      IBilinearMicrokernelTester()
+        .pixels(pixels)
+        .channels(1)
+        .TestCHW(xnn_f32_ibilinear_chw_ukernel__neon_p8);
+    }
+  }
+
+  TEST(F32_IBILINEAR_CHW__NEON_P8, pixels_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t pixels = 1; pixels < 8; pixels++) {
+      IBilinearMicrokernelTester()
+        .pixels(pixels)
+        .channels(1)
+        .TestCHW(xnn_f32_ibilinear_chw_ukernel__neon_p8);
+    }
+  }
+
+  TEST(F32_IBILINEAR_CHW__NEON_P8, pixels_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t pixels = 9; pixels < 16; pixels++) {
+      IBilinearMicrokernelTester()
+        .pixels(pixels)
+        .channels(1)
+        .TestCHW(xnn_f32_ibilinear_chw_ukernel__neon_p8);
+    }
+  }
+
+  TEST(F32_IBILINEAR_CHW__NEON_P8, channels_eq_1) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t pixels = 1; pixels <= 40; pixels += 7) {
+      IBilinearMicrokernelTester()
+        .pixels(pixels)
+        .channels(1)
+        .TestCHW(xnn_f32_ibilinear_chw_ukernel__neon_p8);
+    }
+  }
+
+  TEST(F32_IBILINEAR_CHW__NEON_P8, channels_gt_1) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t channels = 2; channels < 3; channels++) {
+      for (size_t pixels = 1; pixels <= 40; pixels += 7) {
+        IBilinearMicrokernelTester()
+          .pixels(pixels)
+          .channels(channels)
+          .TestCHW(xnn_f32_ibilinear_chw_ukernel__neon_p8);
+      }
+    }
+  }
+
+  TEST(F32_IBILINEAR_CHW__NEON_P8, input_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t pixels = 1; pixels < 40; pixels += 7) {
+      for (size_t channels = 1; channels <= 5; channels += 1) {
+        IBilinearMicrokernelTester()
+          .pixels(pixels)
+          .channels(channels)
+          .input_offset(7)
+          .TestCHW(xnn_f32_ibilinear_chw_ukernel__neon_p8);
+      }
+    }
+  }
+
+  TEST(F32_IBILINEAR_CHW__NEON_P8, input_stride) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t pixels = 1; pixels < 40; pixels += 7) {
+      for (size_t channels = 1; channels <= 5; channels += 1) {
+        IBilinearMicrokernelTester()
+          .pixels(pixels)
+          .channels(channels)
+          .input_stride(163)
+          .TestCHW(xnn_f32_ibilinear_chw_ukernel__neon_p8);
+      }
+    }
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(F32_IBILINEAR_CHW__NEONFMA_P4, pixels_eq_4) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    IBilinearMicrokernelTester()
+      .pixels(4)
+      .channels(1)
+      .TestCHW(xnn_f32_ibilinear_chw_ukernel__neonfma_p4);
+  }
+
+  TEST(F32_IBILINEAR_CHW__NEONFMA_P4, pixels_div_4) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t pixels = 8; pixels < 40; pixels += 4) {
+      IBilinearMicrokernelTester()
+        .pixels(pixels)
+        .channels(1)
+        .TestCHW(xnn_f32_ibilinear_chw_ukernel__neonfma_p4);
+    }
+  }
+
+  TEST(F32_IBILINEAR_CHW__NEONFMA_P4, pixels_lt_4) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t pixels = 1; pixels < 4; pixels++) {
+      IBilinearMicrokernelTester()
+        .pixels(pixels)
+        .channels(1)
+        .TestCHW(xnn_f32_ibilinear_chw_ukernel__neonfma_p4);
+    }
+  }
+
+  TEST(F32_IBILINEAR_CHW__NEONFMA_P4, pixels_gt_4) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t pixels = 5; pixels < 8; pixels++) {
+      IBilinearMicrokernelTester()
+        .pixels(pixels)
+        .channels(1)
+        .TestCHW(xnn_f32_ibilinear_chw_ukernel__neonfma_p4);
+    }
+  }
+
+  TEST(F32_IBILINEAR_CHW__NEONFMA_P4, channels_eq_1) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t pixels = 1; pixels <= 20; pixels += 3) {
+      IBilinearMicrokernelTester()
+        .pixels(pixels)
+        .channels(1)
+        .TestCHW(xnn_f32_ibilinear_chw_ukernel__neonfma_p4);
+    }
+  }
+
+  TEST(F32_IBILINEAR_CHW__NEONFMA_P4, channels_gt_1) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t channels = 2; channels < 3; channels++) {
+      for (size_t pixels = 1; pixels <= 20; pixels += 3) {
+        IBilinearMicrokernelTester()
+          .pixels(pixels)
+          .channels(channels)
+          .TestCHW(xnn_f32_ibilinear_chw_ukernel__neonfma_p4);
+      }
+    }
+  }
+
+  TEST(F32_IBILINEAR_CHW__NEONFMA_P4, input_offset) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t pixels = 1; pixels < 20; pixels += 3) {
+      for (size_t channels = 1; channels <= 5; channels += 1) {
+        IBilinearMicrokernelTester()
+          .pixels(pixels)
+          .channels(channels)
+          .input_offset(7)
+          .TestCHW(xnn_f32_ibilinear_chw_ukernel__neonfma_p4);
+      }
+    }
+  }
+
+  TEST(F32_IBILINEAR_CHW__NEONFMA_P4, input_stride) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t pixels = 1; pixels < 20; pixels += 3) {
+      for (size_t channels = 1; channels <= 5; channels += 1) {
+        IBilinearMicrokernelTester()
+          .pixels(pixels)
+          .channels(channels)
+          .input_stride(83)
+          .TestCHW(xnn_f32_ibilinear_chw_ukernel__neonfma_p4);
+      }
+    }
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(F32_IBILINEAR_CHW__NEONFMA_P8, pixels_eq_8) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    IBilinearMicrokernelTester()
+      .pixels(8)
+      .channels(1)
+      .TestCHW(xnn_f32_ibilinear_chw_ukernel__neonfma_p8);
+  }
+
+  TEST(F32_IBILINEAR_CHW__NEONFMA_P8, pixels_div_8) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t pixels = 16; pixels < 80; pixels += 8) {
+      IBilinearMicrokernelTester()
+        .pixels(pixels)
+        .channels(1)
+        .TestCHW(xnn_f32_ibilinear_chw_ukernel__neonfma_p8);
+    }
+  }
+
+  TEST(F32_IBILINEAR_CHW__NEONFMA_P8, pixels_lt_8) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t pixels = 1; pixels < 8; pixels++) {
+      IBilinearMicrokernelTester()
+        .pixels(pixels)
+        .channels(1)
+        .TestCHW(xnn_f32_ibilinear_chw_ukernel__neonfma_p8);
+    }
+  }
+
+  TEST(F32_IBILINEAR_CHW__NEONFMA_P8, pixels_gt_8) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t pixels = 9; pixels < 16; pixels++) {
+      IBilinearMicrokernelTester()
+        .pixels(pixels)
+        .channels(1)
+        .TestCHW(xnn_f32_ibilinear_chw_ukernel__neonfma_p8);
+    }
+  }
+
+  TEST(F32_IBILINEAR_CHW__NEONFMA_P8, channels_eq_1) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t pixels = 1; pixels <= 40; pixels += 7) {
+      IBilinearMicrokernelTester()
+        .pixels(pixels)
+        .channels(1)
+        .TestCHW(xnn_f32_ibilinear_chw_ukernel__neonfma_p8);
+    }
+  }
+
+  TEST(F32_IBILINEAR_CHW__NEONFMA_P8, channels_gt_1) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t channels = 2; channels < 3; channels++) {
+      for (size_t pixels = 1; pixels <= 40; pixels += 7) {
+        IBilinearMicrokernelTester()
+          .pixels(pixels)
+          .channels(channels)
+          .TestCHW(xnn_f32_ibilinear_chw_ukernel__neonfma_p8);
+      }
+    }
+  }
+
+  TEST(F32_IBILINEAR_CHW__NEONFMA_P8, input_offset) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t pixels = 1; pixels < 40; pixels += 7) {
+      for (size_t channels = 1; channels <= 5; channels += 1) {
+        IBilinearMicrokernelTester()
+          .pixels(pixels)
+          .channels(channels)
+          .input_offset(7)
+          .TestCHW(xnn_f32_ibilinear_chw_ukernel__neonfma_p8);
+      }
+    }
+  }
+
+  TEST(F32_IBILINEAR_CHW__NEONFMA_P8, input_stride) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t pixels = 1; pixels < 40; pixels += 7) {
+      for (size_t channels = 1; channels <= 5; channels += 1) {
+        IBilinearMicrokernelTester()
+          .pixels(pixels)
+          .channels(channels)
+          .input_stride(163)
+          .TestCHW(xnn_f32_ibilinear_chw_ukernel__neonfma_p8);
+      }
+    }
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
diff --git a/test/f32-ibilinear-chw.yaml b/test/f32-ibilinear-chw.yaml
index dcc5eeb..1ef8aef 100644
--- a/test/f32-ibilinear-chw.yaml
+++ b/test/f32-ibilinear-chw.yaml
@@ -7,3 +7,7 @@
 - name: xnn_f32_ibilinear_chw_ukernel__scalar_p4
 - name: xnn_f32_ibilinear_chw_ukernel__wasmsimd_p4
 - name: xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8
+- name: xnn_f32_ibilinear_chw_ukernel__neon_p4
+- name: xnn_f32_ibilinear_chw_ukernel__neon_p8
+- name: xnn_f32_ibilinear_chw_ukernel__neonfma_p4
+- name: xnn_f32_ibilinear_chw_ukernel__neonfma_p8
diff --git a/test/gemm-microkernel-tester.h b/test/gemm-microkernel-tester.h
index 20ec6d4..c0ef22b 100644
--- a/test/gemm-microkernel-tester.h
+++ b/test/gemm-microkernel-tester.h
@@ -440,7 +440,7 @@
     auto rng = std::mt19937(random_device());
     auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), rng);
     auto i8rng = std::bind(
-      std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()), rng);
+      std::uniform_int_distribution<int32_t>(-127, std::numeric_limits<int8_t>::max()), rng);
 
     std::vector<int8_t> a((m() - 1) * a_stride() + k() + XNN_EXTRA_BYTES / sizeof(int8_t));
     std::vector<int8_t> b(n() * k());
diff --git a/test/qs8-gemm-minmax.cc b/test/qs8-gemm-minmax.cc
index 009c235..87e94ef 100644
--- a/test/qs8-gemm-minmax.cc
+++ b/test/qs8-gemm-minmax.cc
@@ -22,6 +22,918 @@
 #include "gemm-microkernel-tester.h"
 
 
+#if XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_2X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__AARCH64_NEON_MLAL_PADAL, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .cn_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .a_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .a_stride(37)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .a_stride(163)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(83)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(83)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__AARCH64_NEON_MLAL_PADAL, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__AARCH64_NEON_MLAL_PADAL, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__AARCH64_NEON_MLAL_PADAL, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__AARCH64_NEON_MLAL_PADAL, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .cm_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
+  }
+#endif  // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_2X8C8__AARCH64_NEON_MULL_PADAL, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(8)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mull_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__AARCH64_NEON_MULL_PADAL, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(8)
+      .cn_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mull_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__AARCH64_NEON_MULL_PADAL, k_eq_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mull_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__AARCH64_NEON_MULL_PADAL, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__AARCH64_NEON_MULL_PADAL, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__AARCH64_NEON_MULL_PADAL, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__AARCH64_NEON_MULL_PADAL, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__AARCH64_NEON_MULL_PADAL, k_lt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__AARCH64_NEON_MULL_PADAL, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__AARCH64_NEON_MULL_PADAL, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__AARCH64_NEON_MULL_PADAL, k_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__AARCH64_NEON_MULL_PADAL, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__AARCH64_NEON_MULL_PADAL, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__AARCH64_NEON_MULL_PADAL, k_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__AARCH64_NEON_MULL_PADAL, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__AARCH64_NEON_MULL_PADAL, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__AARCH64_NEON_MULL_PADAL, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__AARCH64_NEON_MULL_PADAL, n_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__AARCH64_NEON_MULL_PADAL, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__AARCH64_NEON_MULL_PADAL, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__AARCH64_NEON_MULL_PADAL, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__AARCH64_NEON_MULL_PADAL, n_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__AARCH64_NEON_MULL_PADAL, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__AARCH64_NEON_MULL_PADAL, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__AARCH64_NEON_MULL_PADAL, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mull_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__AARCH64_NEON_MULL_PADAL, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mull_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__AARCH64_NEON_MULL_PADAL, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(8)
+      .cm_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mull_padal);
+  }
+#endif  // XNN_ARCH_ARM64
+
+
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(QS8_GEMM_MINMAX_1X8__NEON_MLAL_LANE, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
@@ -935,6 +1847,918 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_4X8__NEON_MLAL_LANE, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8__NEON_MLAL_LANE, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .cn_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8__NEON_MLAL_LANE, k_eq_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8__NEON_MLAL_LANE, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8__NEON_MLAL_LANE, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8__NEON_MLAL_LANE, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8__NEON_MLAL_LANE, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8__NEON_MLAL_LANE, k_lt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8__NEON_MLAL_LANE, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8__NEON_MLAL_LANE, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8__NEON_MLAL_LANE, k_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8__NEON_MLAL_LANE, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8__NEON_MLAL_LANE, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8__NEON_MLAL_LANE, k_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8__NEON_MLAL_LANE, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8__NEON_MLAL_LANE, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8__NEON_MLAL_LANE, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8__NEON_MLAL_LANE, n_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8__NEON_MLAL_LANE, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8__NEON_MLAL_LANE, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8__NEON_MLAL_LANE, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8__NEON_MLAL_LANE, n_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8__NEON_MLAL_LANE, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8__NEON_MLAL_LANE, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8__NEON_MLAL_LANE, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8__NEON_MLAL_LANE, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8__NEON_MLAL_LANE, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .cm_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_3X8__NEON_MLAL_LANE, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(8)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8__NEON_MLAL_LANE, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(8)
+      .cn_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8__NEON_MLAL_LANE, k_eq_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8__NEON_MLAL_LANE, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8__NEON_MLAL_LANE, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8__NEON_MLAL_LANE, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8__NEON_MLAL_LANE, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8__NEON_MLAL_LANE, k_lt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8__NEON_MLAL_LANE, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8__NEON_MLAL_LANE, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8__NEON_MLAL_LANE, k_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8__NEON_MLAL_LANE, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8__NEON_MLAL_LANE, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8__NEON_MLAL_LANE, k_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8__NEON_MLAL_LANE, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8__NEON_MLAL_LANE, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8__NEON_MLAL_LANE, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8__NEON_MLAL_LANE, n_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8__NEON_MLAL_LANE, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8__NEON_MLAL_LANE, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8__NEON_MLAL_LANE, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8__NEON_MLAL_LANE, n_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8__NEON_MLAL_LANE, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8__NEON_MLAL_LANE, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8__NEON_MLAL_LANE, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8__NEON_MLAL_LANE, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8__NEON_MLAL_LANE, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(8)
+      .cm_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(QS8_GEMM_MINMAX_1X16__NEON_MLAL_LANE, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
@@ -1847,6 +3671,22806 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_3X16__NEON_MLAL_LANE, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(8)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16__NEON_MLAL_LANE, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(8)
+      .cn_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16__NEON_MLAL_LANE, k_eq_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16__NEON_MLAL_LANE, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16__NEON_MLAL_LANE, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16__NEON_MLAL_LANE, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16__NEON_MLAL_LANE, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16__NEON_MLAL_LANE, k_lt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16__NEON_MLAL_LANE, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16__NEON_MLAL_LANE, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16__NEON_MLAL_LANE, k_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16__NEON_MLAL_LANE, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16__NEON_MLAL_LANE, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16__NEON_MLAL_LANE, k_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16__NEON_MLAL_LANE, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16__NEON_MLAL_LANE, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(3)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16__NEON_MLAL_LANE, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(3)
+          .n(16)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16__NEON_MLAL_LANE, n_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16__NEON_MLAL_LANE, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16__NEON_MLAL_LANE, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(3)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16__NEON_MLAL_LANE, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16__NEON_MLAL_LANE, n_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16__NEON_MLAL_LANE, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16__NEON_MLAL_LANE, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16__NEON_MLAL_LANE, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16__NEON_MLAL_LANE, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16__NEON_MLAL_LANE, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(8)
+      .cm_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_4X16__NEON_MLAL_LANE, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16__NEON_MLAL_LANE, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .cn_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16__NEON_MLAL_LANE, k_eq_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16__NEON_MLAL_LANE, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16__NEON_MLAL_LANE, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16__NEON_MLAL_LANE, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16__NEON_MLAL_LANE, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16__NEON_MLAL_LANE, k_lt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16__NEON_MLAL_LANE, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16__NEON_MLAL_LANE, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16__NEON_MLAL_LANE, k_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16__NEON_MLAL_LANE, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16__NEON_MLAL_LANE, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16__NEON_MLAL_LANE, k_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16__NEON_MLAL_LANE, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16__NEON_MLAL_LANE, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16__NEON_MLAL_LANE, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16__NEON_MLAL_LANE, n_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16__NEON_MLAL_LANE, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16__NEON_MLAL_LANE, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16__NEON_MLAL_LANE, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16__NEON_MLAL_LANE, n_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16__NEON_MLAL_LANE, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16__NEON_MLAL_LANE, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16__NEON_MLAL_LANE, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16__NEON_MLAL_LANE, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16__NEON_MLAL_LANE, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .cm_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_1X8__NEON_MULL_ADDW_DUP, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(8)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x8__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8__NEON_MULL_ADDW_DUP, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(8)
+      .cn_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x8__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8__NEON_MULL_ADDW_DUP, k_eq_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x8__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8__NEON_MULL_ADDW_DUP, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8__NEON_MULL_ADDW_DUP, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8__NEON_MULL_ADDW_DUP, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8__NEON_MULL_ADDW_DUP, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8__NEON_MULL_ADDW_DUP, k_lt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8__NEON_MULL_ADDW_DUP, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x8__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8__NEON_MULL_ADDW_DUP, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8__NEON_MULL_ADDW_DUP, k_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8__NEON_MULL_ADDW_DUP, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x8__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8__NEON_MULL_ADDW_DUP, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8__NEON_MULL_ADDW_DUP, k_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8__NEON_MULL_ADDW_DUP, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x8__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8__NEON_MULL_ADDW_DUP, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8__NEON_MULL_ADDW_DUP, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8__NEON_MULL_ADDW_DUP, n_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8__NEON_MULL_ADDW_DUP, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x8__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8__NEON_MULL_ADDW_DUP, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8__NEON_MULL_ADDW_DUP, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8__NEON_MULL_ADDW_DUP, n_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8__NEON_MULL_ADDW_DUP, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x8__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8__NEON_MULL_ADDW_DUP, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x8__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8__NEON_MULL_ADDW_DUP, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x8__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8__NEON_MULL_ADDW_DUP, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x8__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8__NEON_MULL_ADDW_DUP, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(8)
+      .cm_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x8__neon_mull_addw_dup);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_2X8__NEON_MULL_ADDW_DUP, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(8)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8__NEON_MULL_ADDW_DUP, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(8)
+      .cn_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8__NEON_MULL_ADDW_DUP, k_eq_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8__NEON_MULL_ADDW_DUP, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8__NEON_MULL_ADDW_DUP, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8__NEON_MULL_ADDW_DUP, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8__NEON_MULL_ADDW_DUP, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8__NEON_MULL_ADDW_DUP, k_lt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8__NEON_MULL_ADDW_DUP, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8__NEON_MULL_ADDW_DUP, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8__NEON_MULL_ADDW_DUP, k_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8__NEON_MULL_ADDW_DUP, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8__NEON_MULL_ADDW_DUP, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8__NEON_MULL_ADDW_DUP, k_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8__NEON_MULL_ADDW_DUP, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8__NEON_MULL_ADDW_DUP, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8__NEON_MULL_ADDW_DUP, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8__NEON_MULL_ADDW_DUP, n_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8__NEON_MULL_ADDW_DUP, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8__NEON_MULL_ADDW_DUP, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8__NEON_MULL_ADDW_DUP, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8__NEON_MULL_ADDW_DUP, n_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8__NEON_MULL_ADDW_DUP, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8__NEON_MULL_ADDW_DUP, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8__NEON_MULL_ADDW_DUP, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8__NEON_MULL_ADDW_DUP, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8__NEON_MULL_ADDW_DUP, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(8)
+      .cm_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8__neon_mull_addw_dup);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_3X8__NEON_MULL_ADDW_DUP, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(8)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8__NEON_MULL_ADDW_DUP, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(8)
+      .cn_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8__NEON_MULL_ADDW_DUP, k_eq_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8__NEON_MULL_ADDW_DUP, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8__NEON_MULL_ADDW_DUP, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8__NEON_MULL_ADDW_DUP, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8__NEON_MULL_ADDW_DUP, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8__NEON_MULL_ADDW_DUP, k_lt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8__NEON_MULL_ADDW_DUP, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8__NEON_MULL_ADDW_DUP, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8__NEON_MULL_ADDW_DUP, k_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8__NEON_MULL_ADDW_DUP, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8__NEON_MULL_ADDW_DUP, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8__NEON_MULL_ADDW_DUP, k_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8__NEON_MULL_ADDW_DUP, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8__NEON_MULL_ADDW_DUP, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8__NEON_MULL_ADDW_DUP, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8__NEON_MULL_ADDW_DUP, n_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8__NEON_MULL_ADDW_DUP, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8__NEON_MULL_ADDW_DUP, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8__NEON_MULL_ADDW_DUP, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8__NEON_MULL_ADDW_DUP, n_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8__NEON_MULL_ADDW_DUP, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8__NEON_MULL_ADDW_DUP, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8__NEON_MULL_ADDW_DUP, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8__NEON_MULL_ADDW_DUP, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8__NEON_MULL_ADDW_DUP, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(8)
+      .cm_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mull_addw_dup);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_4X8__NEON_MULL_ADDW_DUP, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8__NEON_MULL_ADDW_DUP, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .cn_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8__NEON_MULL_ADDW_DUP, k_eq_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8__NEON_MULL_ADDW_DUP, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8__NEON_MULL_ADDW_DUP, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8__NEON_MULL_ADDW_DUP, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8__NEON_MULL_ADDW_DUP, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8__NEON_MULL_ADDW_DUP, k_lt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8__NEON_MULL_ADDW_DUP, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8__NEON_MULL_ADDW_DUP, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8__NEON_MULL_ADDW_DUP, k_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8__NEON_MULL_ADDW_DUP, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8__NEON_MULL_ADDW_DUP, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8__NEON_MULL_ADDW_DUP, k_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8__NEON_MULL_ADDW_DUP, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8__NEON_MULL_ADDW_DUP, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8__NEON_MULL_ADDW_DUP, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8__NEON_MULL_ADDW_DUP, n_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8__NEON_MULL_ADDW_DUP, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8__NEON_MULL_ADDW_DUP, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8__NEON_MULL_ADDW_DUP, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8__NEON_MULL_ADDW_DUP, n_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8__NEON_MULL_ADDW_DUP, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8__NEON_MULL_ADDW_DUP, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8__NEON_MULL_ADDW_DUP, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8__NEON_MULL_ADDW_DUP, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8__NEON_MULL_ADDW_DUP, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .cm_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mull_addw_dup);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_1X16__NEON_MULL_ADDW_DUP, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(8)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16__NEON_MULL_ADDW_DUP, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(8)
+      .cn_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16__NEON_MULL_ADDW_DUP, k_eq_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16__NEON_MULL_ADDW_DUP, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16__NEON_MULL_ADDW_DUP, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16__NEON_MULL_ADDW_DUP, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16__NEON_MULL_ADDW_DUP, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16__NEON_MULL_ADDW_DUP, k_lt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16__NEON_MULL_ADDW_DUP, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16__NEON_MULL_ADDW_DUP, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16__NEON_MULL_ADDW_DUP, k_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16__NEON_MULL_ADDW_DUP, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16__NEON_MULL_ADDW_DUP, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16__NEON_MULL_ADDW_DUP, k_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16__NEON_MULL_ADDW_DUP, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16__NEON_MULL_ADDW_DUP, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(1)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16__NEON_MULL_ADDW_DUP, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(1)
+          .n(16)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16__NEON_MULL_ADDW_DUP, n_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16__NEON_MULL_ADDW_DUP, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16__NEON_MULL_ADDW_DUP, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(1)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16__NEON_MULL_ADDW_DUP, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16__NEON_MULL_ADDW_DUP, n_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16__NEON_MULL_ADDW_DUP, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16__NEON_MULL_ADDW_DUP, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16__NEON_MULL_ADDW_DUP, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16__NEON_MULL_ADDW_DUP, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16__NEON_MULL_ADDW_DUP, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(8)
+      .cm_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_2X16__NEON_MULL_ADDW_DUP, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(8)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16__NEON_MULL_ADDW_DUP, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(8)
+      .cn_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16__NEON_MULL_ADDW_DUP, k_eq_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16__NEON_MULL_ADDW_DUP, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16__NEON_MULL_ADDW_DUP, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16__NEON_MULL_ADDW_DUP, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16__NEON_MULL_ADDW_DUP, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16__NEON_MULL_ADDW_DUP, k_lt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16__NEON_MULL_ADDW_DUP, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16__NEON_MULL_ADDW_DUP, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16__NEON_MULL_ADDW_DUP, k_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16__NEON_MULL_ADDW_DUP, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16__NEON_MULL_ADDW_DUP, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16__NEON_MULL_ADDW_DUP, k_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16__NEON_MULL_ADDW_DUP, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16__NEON_MULL_ADDW_DUP, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16__NEON_MULL_ADDW_DUP, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16__NEON_MULL_ADDW_DUP, n_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16__NEON_MULL_ADDW_DUP, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16__NEON_MULL_ADDW_DUP, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16__NEON_MULL_ADDW_DUP, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16__NEON_MULL_ADDW_DUP, n_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16__NEON_MULL_ADDW_DUP, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16__NEON_MULL_ADDW_DUP, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16__NEON_MULL_ADDW_DUP, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16__NEON_MULL_ADDW_DUP, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16__NEON_MULL_ADDW_DUP, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(8)
+      .cm_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_3X16__NEON_MULL_ADDW_DUP, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(8)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16__NEON_MULL_ADDW_DUP, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(8)
+      .cn_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16__NEON_MULL_ADDW_DUP, k_eq_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16__NEON_MULL_ADDW_DUP, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16__NEON_MULL_ADDW_DUP, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16__NEON_MULL_ADDW_DUP, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16__NEON_MULL_ADDW_DUP, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16__NEON_MULL_ADDW_DUP, k_lt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16__NEON_MULL_ADDW_DUP, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16__NEON_MULL_ADDW_DUP, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16__NEON_MULL_ADDW_DUP, k_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16__NEON_MULL_ADDW_DUP, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16__NEON_MULL_ADDW_DUP, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16__NEON_MULL_ADDW_DUP, k_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16__NEON_MULL_ADDW_DUP, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16__NEON_MULL_ADDW_DUP, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16__NEON_MULL_ADDW_DUP, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16__NEON_MULL_ADDW_DUP, n_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16__NEON_MULL_ADDW_DUP, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16__NEON_MULL_ADDW_DUP, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16__NEON_MULL_ADDW_DUP, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16__NEON_MULL_ADDW_DUP, n_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16__NEON_MULL_ADDW_DUP, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16__NEON_MULL_ADDW_DUP, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16__NEON_MULL_ADDW_DUP, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16__NEON_MULL_ADDW_DUP, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16__NEON_MULL_ADDW_DUP, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(8)
+      .cm_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_4X16__NEON_MULL_ADDW_DUP, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16__NEON_MULL_ADDW_DUP, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .cn_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16__NEON_MULL_ADDW_DUP, k_eq_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16__NEON_MULL_ADDW_DUP, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16__NEON_MULL_ADDW_DUP, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16__NEON_MULL_ADDW_DUP, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16__NEON_MULL_ADDW_DUP, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16__NEON_MULL_ADDW_DUP, k_lt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16__NEON_MULL_ADDW_DUP, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16__NEON_MULL_ADDW_DUP, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16__NEON_MULL_ADDW_DUP, k_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16__NEON_MULL_ADDW_DUP, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16__NEON_MULL_ADDW_DUP, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16__NEON_MULL_ADDW_DUP, k_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16__NEON_MULL_ADDW_DUP, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16__NEON_MULL_ADDW_DUP, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16__NEON_MULL_ADDW_DUP, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16__NEON_MULL_ADDW_DUP, n_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16__NEON_MULL_ADDW_DUP, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16__NEON_MULL_ADDW_DUP, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16__NEON_MULL_ADDW_DUP, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16__NEON_MULL_ADDW_DUP, n_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16__NEON_MULL_ADDW_DUP, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16__NEON_MULL_ADDW_DUP, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16__NEON_MULL_ADDW_DUP, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16__NEON_MULL_ADDW_DUP, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16__NEON_MULL_ADDW_DUP, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .cm_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(8)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(8)
+      .cn_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, k_eq_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, k_lt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, k_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, k_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, n_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, n_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(8)
+      .cm_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(8)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(8)
+      .cn_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, k_eq_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, k_lt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, k_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, k_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, n_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, n_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(8)
+      .cm_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(8)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(8)
+      .cn_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, k_eq_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, k_lt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, k_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, k_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, n_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, n_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(8)
+      .cm_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .cn_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, k_eq_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, k_lt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, k_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, k_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, n_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, n_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .cm_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(8)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(8)
+      .cn_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, k_eq_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, k_lt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, k_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, k_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(16)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, n_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, n_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(8)
+      .cm_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(8)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(8)
+      .cn_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, k_eq_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, k_lt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, k_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, k_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(16)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, n_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, n_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(8)
+      .cm_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(8)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(8)
+      .cn_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, k_eq_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, k_lt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, k_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, k_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(16)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, n_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, n_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(8)
+      .cm_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .cn_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, k_eq_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, k_lt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, k_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, k_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, n_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, n_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .cm_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(16)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(16)
+      .cn_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(16)
+      .a_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, k_lt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, k_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .a_stride(37)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, k_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .a_stride(163)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(83)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, n_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(83)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(16)
+      .cm_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .cn_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .a_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, k_lt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, k_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .a_stride(37)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, k_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .a_stride(163)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(83)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, n_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(83)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .cm_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(16)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(16)
+      .cn_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(16)
+      .a_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, k_lt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, k_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .a_stride(37)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, k_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .a_stride(163)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(83)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, n_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(83)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(16)
+      .cm_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(16)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(16)
+      .cn_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(16)
+      .a_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, k_lt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, k_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .a_stride(37)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, k_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .a_stride(163)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(83)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, n_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(83)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(16)
+      .cm_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(16)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(16)
+      .cn_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(16)
+      .a_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, k_lt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, k_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .a_stride(37)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, k_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .a_stride(163)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(16)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(83)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, n_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(83)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(16)
+      .cm_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(16)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(16)
+      .cn_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(16)
+      .a_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, k_lt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, k_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .a_stride(37)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, k_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .a_stride(163)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(16)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(83)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, n_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(83)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(16)
+      .cm_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(16)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(16)
+      .cn_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(16)
+      .a_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, k_lt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, k_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .a_stride(37)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, k_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .a_stride(163)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(16)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(83)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, n_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(83)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(16)
+      .cm_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .cn_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .a_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, k_lt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, k_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(37)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, k_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(163)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(83)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, n_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(83)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .cm_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_1X8C8__NEON_MULL_PADAL, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(8)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mull_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C8__NEON_MULL_PADAL, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(8)
+      .cn_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mull_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C8__NEON_MULL_PADAL, k_eq_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mull_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C8__NEON_MULL_PADAL, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C8__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C8__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C8__NEON_MULL_PADAL, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C8__NEON_MULL_PADAL, k_lt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C8__NEON_MULL_PADAL, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C8__NEON_MULL_PADAL, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C8__NEON_MULL_PADAL, k_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C8__NEON_MULL_PADAL, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C8__NEON_MULL_PADAL, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C8__NEON_MULL_PADAL, k_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C8__NEON_MULL_PADAL, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C8__NEON_MULL_PADAL, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C8__NEON_MULL_PADAL, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C8__NEON_MULL_PADAL, n_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C8__NEON_MULL_PADAL, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C8__NEON_MULL_PADAL, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C8__NEON_MULL_PADAL, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C8__NEON_MULL_PADAL, n_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C8__NEON_MULL_PADAL, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C8__NEON_MULL_PADAL, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C8__NEON_MULL_PADAL, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mull_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C8__NEON_MULL_PADAL, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mull_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C8__NEON_MULL_PADAL, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(8)
+      .cm_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mull_padal);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
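+// A note on the GemmMicrokernelTester fluent calls used throughout these
+// generated tests (a reading inferred from the parameter names, not a
+// normative description of the tester):
+//   mr()/nr()/kr()/sr() - the register-tile configuration the micro-kernel was built for;
+//   m()/n()/k()         - the problem size actually handed to the kernel;
+//   a_stride()/cn_stride()/cm_stride() - strided layouts for the A input and C output;
+//   qmin()/qmax()       - clamping bounds applied to the requantized result;
+//   iterations(1)       - a single repetition, keeping the large subtile sweeps fast.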
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_2X8C8__NEON_MULL_PADAL, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(8)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mull_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__NEON_MULL_PADAL, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(8)
+      .cn_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mull_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__NEON_MULL_PADAL, k_eq_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mull_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__NEON_MULL_PADAL, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__NEON_MULL_PADAL, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__NEON_MULL_PADAL, k_lt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__NEON_MULL_PADAL, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__NEON_MULL_PADAL, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__NEON_MULL_PADAL, k_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__NEON_MULL_PADAL, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__NEON_MULL_PADAL, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__NEON_MULL_PADAL, k_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__NEON_MULL_PADAL, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__NEON_MULL_PADAL, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__NEON_MULL_PADAL, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__NEON_MULL_PADAL, n_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__NEON_MULL_PADAL, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__NEON_MULL_PADAL, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__NEON_MULL_PADAL, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__NEON_MULL_PADAL, n_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__NEON_MULL_PADAL, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__NEON_MULL_PADAL, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__NEON_MULL_PADAL, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mull_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__NEON_MULL_PADAL, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mull_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__NEON_MULL_PADAL, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(8)
+      .cm_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mull_padal);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_3X8C8__NEON_MULL_PADAL, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(8)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C8__NEON_MULL_PADAL, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(8)
+      .cn_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C8__NEON_MULL_PADAL, k_eq_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C8__NEON_MULL_PADAL, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C8__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C8__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C8__NEON_MULL_PADAL, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C8__NEON_MULL_PADAL, k_lt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C8__NEON_MULL_PADAL, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C8__NEON_MULL_PADAL, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C8__NEON_MULL_PADAL, k_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C8__NEON_MULL_PADAL, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C8__NEON_MULL_PADAL, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C8__NEON_MULL_PADAL, k_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C8__NEON_MULL_PADAL, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C8__NEON_MULL_PADAL, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C8__NEON_MULL_PADAL, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C8__NEON_MULL_PADAL, n_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C8__NEON_MULL_PADAL, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C8__NEON_MULL_PADAL, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C8__NEON_MULL_PADAL, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C8__NEON_MULL_PADAL, n_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C8__NEON_MULL_PADAL, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C8__NEON_MULL_PADAL, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C8__NEON_MULL_PADAL, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C8__NEON_MULL_PADAL, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C8__NEON_MULL_PADAL, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(8)
+      .cm_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_4X8C8__NEON_MULL_PADAL, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C8__NEON_MULL_PADAL, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .cn_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C8__NEON_MULL_PADAL, k_eq_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C8__NEON_MULL_PADAL, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C8__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C8__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C8__NEON_MULL_PADAL, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C8__NEON_MULL_PADAL, k_lt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C8__NEON_MULL_PADAL, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C8__NEON_MULL_PADAL, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C8__NEON_MULL_PADAL, k_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C8__NEON_MULL_PADAL, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C8__NEON_MULL_PADAL, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C8__NEON_MULL_PADAL, k_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C8__NEON_MULL_PADAL, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C8__NEON_MULL_PADAL, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C8__NEON_MULL_PADAL, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C8__NEON_MULL_PADAL, n_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C8__NEON_MULL_PADAL, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C8__NEON_MULL_PADAL, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C8__NEON_MULL_PADAL, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C8__NEON_MULL_PADAL, n_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C8__NEON_MULL_PADAL, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C8__NEON_MULL_PADAL, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C8__NEON_MULL_PADAL, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C8__NEON_MULL_PADAL, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C8__NEON_MULL_PADAL, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .cm_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
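+// From this point on the kernels under test use an NR=16 tile, so the n sweeps
+// widen accordingly (n_gt_16 / n_div_16 instead of n_gt_8 / n_div_8) and the
+// stride constants grow to match (cn_stride/cm_stride of 19 rather than 11).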
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_1X16C8__NEON_MULL_PADAL, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(8)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C8__NEON_MULL_PADAL, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(8)
+      .cn_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C8__NEON_MULL_PADAL, k_eq_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C8__NEON_MULL_PADAL, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C8__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C8__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C8__NEON_MULL_PADAL, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C8__NEON_MULL_PADAL, k_lt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C8__NEON_MULL_PADAL, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C8__NEON_MULL_PADAL, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C8__NEON_MULL_PADAL, k_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C8__NEON_MULL_PADAL, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C8__NEON_MULL_PADAL, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C8__NEON_MULL_PADAL, k_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C8__NEON_MULL_PADAL, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C8__NEON_MULL_PADAL, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C8__NEON_MULL_PADAL, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(16)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C8__NEON_MULL_PADAL, n_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C8__NEON_MULL_PADAL, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C8__NEON_MULL_PADAL, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C8__NEON_MULL_PADAL, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C8__NEON_MULL_PADAL, n_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C8__NEON_MULL_PADAL, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C8__NEON_MULL_PADAL, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C8__NEON_MULL_PADAL, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C8__NEON_MULL_PADAL, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C8__NEON_MULL_PADAL, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(8)
+      .cm_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_2X16C8__NEON_MULL_PADAL, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(8)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C8__NEON_MULL_PADAL, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(8)
+      .cn_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C8__NEON_MULL_PADAL, k_eq_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C8__NEON_MULL_PADAL, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C8__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C8__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C8__NEON_MULL_PADAL, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C8__NEON_MULL_PADAL, k_lt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C8__NEON_MULL_PADAL, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C8__NEON_MULL_PADAL, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C8__NEON_MULL_PADAL, k_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C8__NEON_MULL_PADAL, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C8__NEON_MULL_PADAL, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C8__NEON_MULL_PADAL, k_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C8__NEON_MULL_PADAL, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C8__NEON_MULL_PADAL, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C8__NEON_MULL_PADAL, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(16)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C8__NEON_MULL_PADAL, n_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C8__NEON_MULL_PADAL, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C8__NEON_MULL_PADAL, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C8__NEON_MULL_PADAL, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C8__NEON_MULL_PADAL, n_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C8__NEON_MULL_PADAL, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C8__NEON_MULL_PADAL, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C8__NEON_MULL_PADAL, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C8__NEON_MULL_PADAL, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C8__NEON_MULL_PADAL, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(8)
+      .cm_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_3X16C8__NEON_MULL_PADAL, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(8)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C8__NEON_MULL_PADAL, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(8)
+      .cn_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C8__NEON_MULL_PADAL, k_eq_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C8__NEON_MULL_PADAL, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C8__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C8__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C8__NEON_MULL_PADAL, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C8__NEON_MULL_PADAL, k_lt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C8__NEON_MULL_PADAL, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C8__NEON_MULL_PADAL, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C8__NEON_MULL_PADAL, k_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C8__NEON_MULL_PADAL, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C8__NEON_MULL_PADAL, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C8__NEON_MULL_PADAL, k_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C8__NEON_MULL_PADAL, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C8__NEON_MULL_PADAL, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C8__NEON_MULL_PADAL, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(16)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C8__NEON_MULL_PADAL, n_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C8__NEON_MULL_PADAL, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C8__NEON_MULL_PADAL, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C8__NEON_MULL_PADAL, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C8__NEON_MULL_PADAL, n_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C8__NEON_MULL_PADAL, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C8__NEON_MULL_PADAL, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C8__NEON_MULL_PADAL, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C8__NEON_MULL_PADAL, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C8__NEON_MULL_PADAL, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(8)
+      .cm_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_4X16C8__NEON_MULL_PADAL, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C8__NEON_MULL_PADAL, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .cn_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C8__NEON_MULL_PADAL, k_eq_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C8__NEON_MULL_PADAL, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C8__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C8__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C8__NEON_MULL_PADAL, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C8__NEON_MULL_PADAL, k_lt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C8__NEON_MULL_PADAL, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C8__NEON_MULL_PADAL, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C8__NEON_MULL_PADAL, k_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C8__NEON_MULL_PADAL, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C8__NEON_MULL_PADAL, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C8__NEON_MULL_PADAL, k_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C8__NEON_MULL_PADAL, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C8__NEON_MULL_PADAL, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C8__NEON_MULL_PADAL, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C8__NEON_MULL_PADAL, n_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C8__NEON_MULL_PADAL, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C8__NEON_MULL_PADAL, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C8__NEON_MULL_PADAL, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C8__NEON_MULL_PADAL, n_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C8__NEON_MULL_PADAL, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C8__NEON_MULL_PADAL, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C8__NEON_MULL_PADAL, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C8__NEON_MULL_PADAL, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C8__NEON_MULL_PADAL, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .cm_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_1X8C8__NEON_MLAL_PADAL, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(16)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C8__NEON_MLAL_PADAL, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(16)
+      .cn_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C8__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(16)
+      .a_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C8__NEON_MLAL_PADAL, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C8__NEON_MLAL_PADAL, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C8__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C8__NEON_MLAL_PADAL, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C8__NEON_MLAL_PADAL, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C8__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .a_stride(37)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C8__NEON_MLAL_PADAL, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C8__NEON_MLAL_PADAL, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C8__NEON_MLAL_PADAL, k_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .a_stride(163)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C8__NEON_MLAL_PADAL, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C8__NEON_MLAL_PADAL, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C8__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C8__NEON_MLAL_PADAL, n_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(83)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C8__NEON_MLAL_PADAL, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C8__NEON_MLAL_PADAL, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C8__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C8__NEON_MLAL_PADAL, n_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(83)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C8__NEON_MLAL_PADAL, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C8__NEON_MLAL_PADAL, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C8__NEON_MLAL_PADAL, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C8__NEON_MLAL_PADAL, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C8__NEON_MLAL_PADAL, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(16)
+      .cm_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_2X8C8__NEON_MLAL_PADAL, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__NEON_MLAL_PADAL, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .cn_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .a_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__NEON_MLAL_PADAL, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__NEON_MLAL_PADAL, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__NEON_MLAL_PADAL, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__NEON_MLAL_PADAL, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .a_stride(37)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__NEON_MLAL_PADAL, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__NEON_MLAL_PADAL, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__NEON_MLAL_PADAL, k_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .a_stride(163)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__NEON_MLAL_PADAL, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__NEON_MLAL_PADAL, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__NEON_MLAL_PADAL, n_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(83)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__NEON_MLAL_PADAL, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__NEON_MLAL_PADAL, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__NEON_MLAL_PADAL, n_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(83)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__NEON_MLAL_PADAL, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__NEON_MLAL_PADAL, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__NEON_MLAL_PADAL, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__NEON_MLAL_PADAL, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C8__NEON_MLAL_PADAL, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .cm_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_3X8C8__NEON_MLAL_PADAL, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(16)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C8__NEON_MLAL_PADAL, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(16)
+      .cn_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C8__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(16)
+      .a_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C8__NEON_MLAL_PADAL, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C8__NEON_MLAL_PADAL, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C8__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C8__NEON_MLAL_PADAL, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C8__NEON_MLAL_PADAL, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C8__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .a_stride(37)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C8__NEON_MLAL_PADAL, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C8__NEON_MLAL_PADAL, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C8__NEON_MLAL_PADAL, k_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .a_stride(163)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C8__NEON_MLAL_PADAL, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C8__NEON_MLAL_PADAL, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C8__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C8__NEON_MLAL_PADAL, n_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(83)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C8__NEON_MLAL_PADAL, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C8__NEON_MLAL_PADAL, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C8__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C8__NEON_MLAL_PADAL, n_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(83)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C8__NEON_MLAL_PADAL, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C8__NEON_MLAL_PADAL, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C8__NEON_MLAL_PADAL, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C8__NEON_MLAL_PADAL, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C8__NEON_MLAL_PADAL, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(16)
+      .cm_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_4X8C8__NEON_MLAL_PADAL, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(16)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C8__NEON_MLAL_PADAL, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(16)
+      .cn_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C8__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(16)
+      .a_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C8__NEON_MLAL_PADAL, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C8__NEON_MLAL_PADAL, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C8__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C8__NEON_MLAL_PADAL, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C8__NEON_MLAL_PADAL, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C8__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .a_stride(37)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C8__NEON_MLAL_PADAL, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C8__NEON_MLAL_PADAL, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C8__NEON_MLAL_PADAL, k_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .a_stride(163)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C8__NEON_MLAL_PADAL, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C8__NEON_MLAL_PADAL, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C8__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C8__NEON_MLAL_PADAL, n_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(83)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C8__NEON_MLAL_PADAL, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C8__NEON_MLAL_PADAL, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C8__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C8__NEON_MLAL_PADAL, n_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(83)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C8__NEON_MLAL_PADAL, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C8__NEON_MLAL_PADAL, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C8__NEON_MLAL_PADAL, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C8__NEON_MLAL_PADAL, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C8__NEON_MLAL_PADAL, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(16)
+      .cm_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_1X16C8__NEON_MLAL_PADAL, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(16)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C8__NEON_MLAL_PADAL, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(16)
+      .cn_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C8__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(16)
+      .a_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C8__NEON_MLAL_PADAL, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C8__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C8__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C8__NEON_MLAL_PADAL, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C8__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C8__NEON_MLAL_PADAL, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C8__NEON_MLAL_PADAL, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C8__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .a_stride(37)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C8__NEON_MLAL_PADAL, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C8__NEON_MLAL_PADAL, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C8__NEON_MLAL_PADAL, k_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .a_stride(163)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C8__NEON_MLAL_PADAL, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C8__NEON_MLAL_PADAL, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C8__NEON_MLAL_PADAL, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(16)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C8__NEON_MLAL_PADAL, n_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(83)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C8__NEON_MLAL_PADAL, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C8__NEON_MLAL_PADAL, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C8__NEON_MLAL_PADAL, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C8__NEON_MLAL_PADAL, n_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(83)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C8__NEON_MLAL_PADAL, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C8__NEON_MLAL_PADAL, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C8__NEON_MLAL_PADAL, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C8__NEON_MLAL_PADAL, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C8__NEON_MLAL_PADAL, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(16)
+      .cm_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_2X16C8__NEON_MLAL_PADAL, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(16)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C8__NEON_MLAL_PADAL, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(16)
+      .cn_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C8__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(16)
+      .a_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C8__NEON_MLAL_PADAL, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C8__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C8__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C8__NEON_MLAL_PADAL, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C8__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C8__NEON_MLAL_PADAL, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C8__NEON_MLAL_PADAL, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C8__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .a_stride(37)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C8__NEON_MLAL_PADAL, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C8__NEON_MLAL_PADAL, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C8__NEON_MLAL_PADAL, k_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .a_stride(163)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C8__NEON_MLAL_PADAL, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C8__NEON_MLAL_PADAL, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C8__NEON_MLAL_PADAL, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(16)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C8__NEON_MLAL_PADAL, n_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(83)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C8__NEON_MLAL_PADAL, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C8__NEON_MLAL_PADAL, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C8__NEON_MLAL_PADAL, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C8__NEON_MLAL_PADAL, n_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(83)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C8__NEON_MLAL_PADAL, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C8__NEON_MLAL_PADAL, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C8__NEON_MLAL_PADAL, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C8__NEON_MLAL_PADAL, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C8__NEON_MLAL_PADAL, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(16)
+      .cm_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_3X16C8__NEON_MLAL_PADAL, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(16)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C8__NEON_MLAL_PADAL, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(16)
+      .cn_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C8__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(16)
+      .a_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C8__NEON_MLAL_PADAL, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C8__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C8__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C8__NEON_MLAL_PADAL, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C8__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C8__NEON_MLAL_PADAL, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C8__NEON_MLAL_PADAL, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C8__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .a_stride(37)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C8__NEON_MLAL_PADAL, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C8__NEON_MLAL_PADAL, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C8__NEON_MLAL_PADAL, k_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .a_stride(163)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C8__NEON_MLAL_PADAL, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C8__NEON_MLAL_PADAL, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C8__NEON_MLAL_PADAL, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(16)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C8__NEON_MLAL_PADAL, n_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(83)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C8__NEON_MLAL_PADAL, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C8__NEON_MLAL_PADAL, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C8__NEON_MLAL_PADAL, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C8__NEON_MLAL_PADAL, n_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(83)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C8__NEON_MLAL_PADAL, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C8__NEON_MLAL_PADAL, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C8__NEON_MLAL_PADAL, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C8__NEON_MLAL_PADAL, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C8__NEON_MLAL_PADAL, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(16)
+      .cm_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_4X16C8__NEON_MLAL_PADAL, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C8__NEON_MLAL_PADAL, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .cn_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C8__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .a_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C8__NEON_MLAL_PADAL, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C8__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C8__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C8__NEON_MLAL_PADAL, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C8__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C8__NEON_MLAL_PADAL, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C8__NEON_MLAL_PADAL, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C8__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(37)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C8__NEON_MLAL_PADAL, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C8__NEON_MLAL_PADAL, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C8__NEON_MLAL_PADAL, k_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(163)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C8__NEON_MLAL_PADAL, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C8__NEON_MLAL_PADAL, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C8__NEON_MLAL_PADAL, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C8__NEON_MLAL_PADAL, n_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(83)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C8__NEON_MLAL_PADAL, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C8__NEON_MLAL_PADAL, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C8__NEON_MLAL_PADAL, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C8__NEON_MLAL_PADAL, n_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(83)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C8__NEON_MLAL_PADAL, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C8__NEON_MLAL_PADAL, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C8__NEON_MLAL_PADAL, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C8__NEON_MLAL_PADAL, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C8__NEON_MLAL_PADAL, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .cm_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_1X8C16__NEON_MLAL_PADAL, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(16)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x8c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C16__NEON_MLAL_PADAL, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(16)
+      .cn_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x8c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C16__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(16)
+      .a_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x8c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C16__NEON_MLAL_PADAL, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C16__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C16__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C16__NEON_MLAL_PADAL, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C16__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C16__NEON_MLAL_PADAL, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x8c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C16__NEON_MLAL_PADAL, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C16__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .a_stride(37)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C16__NEON_MLAL_PADAL, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x8c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C16__NEON_MLAL_PADAL, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C16__NEON_MLAL_PADAL, k_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .a_stride(163)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C16__NEON_MLAL_PADAL, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x8c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C16__NEON_MLAL_PADAL, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C16__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C16__NEON_MLAL_PADAL, n_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(83)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C16__NEON_MLAL_PADAL, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x8c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C16__NEON_MLAL_PADAL, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C16__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C16__NEON_MLAL_PADAL, n_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(83)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C16__NEON_MLAL_PADAL, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x8c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C16__NEON_MLAL_PADAL, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x8c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C16__NEON_MLAL_PADAL, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x8c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C16__NEON_MLAL_PADAL, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x8c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X8C16__NEON_MLAL_PADAL, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(16)
+      .cm_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x8c16__neon_mlal_padal);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_2X8C16__NEON_MLAL_PADAL, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C16__NEON_MLAL_PADAL, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .cn_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C16__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .a_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C16__NEON_MLAL_PADAL, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C16__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C16__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C16__NEON_MLAL_PADAL, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C16__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C16__NEON_MLAL_PADAL, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C16__NEON_MLAL_PADAL, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C16__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .a_stride(37)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C16__NEON_MLAL_PADAL, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C16__NEON_MLAL_PADAL, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C16__NEON_MLAL_PADAL, k_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .a_stride(163)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C16__NEON_MLAL_PADAL, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C16__NEON_MLAL_PADAL, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C16__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C16__NEON_MLAL_PADAL, n_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(83)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C16__NEON_MLAL_PADAL, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C16__NEON_MLAL_PADAL, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C16__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C16__NEON_MLAL_PADAL, n_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(83)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C16__NEON_MLAL_PADAL, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C16__NEON_MLAL_PADAL, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C16__NEON_MLAL_PADAL, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C16__NEON_MLAL_PADAL, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C16__NEON_MLAL_PADAL, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .cm_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c16__neon_mlal_padal);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_3X8C16__NEON_MLAL_PADAL, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(16)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C16__NEON_MLAL_PADAL, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(16)
+      .cn_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C16__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(16)
+      .a_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C16__NEON_MLAL_PADAL, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C16__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C16__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C16__NEON_MLAL_PADAL, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C16__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C16__NEON_MLAL_PADAL, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C16__NEON_MLAL_PADAL, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C16__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .a_stride(37)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C16__NEON_MLAL_PADAL, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C16__NEON_MLAL_PADAL, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C16__NEON_MLAL_PADAL, k_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .a_stride(163)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C16__NEON_MLAL_PADAL, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C16__NEON_MLAL_PADAL, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C16__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C16__NEON_MLAL_PADAL, n_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(83)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C16__NEON_MLAL_PADAL, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C16__NEON_MLAL_PADAL, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C16__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C16__NEON_MLAL_PADAL, n_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(83)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C16__NEON_MLAL_PADAL, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C16__NEON_MLAL_PADAL, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C16__NEON_MLAL_PADAL, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C16__NEON_MLAL_PADAL, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X8C16__NEON_MLAL_PADAL, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(16)
+      .cm_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_4X8C16__NEON_MLAL_PADAL, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(16)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C16__NEON_MLAL_PADAL, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(16)
+      .cn_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C16__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(16)
+      .a_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C16__NEON_MLAL_PADAL, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C16__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C16__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C16__NEON_MLAL_PADAL, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C16__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C16__NEON_MLAL_PADAL, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C16__NEON_MLAL_PADAL, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C16__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .a_stride(37)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C16__NEON_MLAL_PADAL, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C16__NEON_MLAL_PADAL, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C16__NEON_MLAL_PADAL, k_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .a_stride(163)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C16__NEON_MLAL_PADAL, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C16__NEON_MLAL_PADAL, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C16__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C16__NEON_MLAL_PADAL, n_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(83)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C16__NEON_MLAL_PADAL, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C16__NEON_MLAL_PADAL, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C16__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C16__NEON_MLAL_PADAL, n_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(83)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C16__NEON_MLAL_PADAL, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C16__NEON_MLAL_PADAL, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C16__NEON_MLAL_PADAL, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C16__NEON_MLAL_PADAL, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X8C16__NEON_MLAL_PADAL, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(16)
+      .cm_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_1X16C16__NEON_MLAL_PADAL, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(16)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(16)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C16__NEON_MLAL_PADAL, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(16)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(16)
+      .cn_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C16__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(16)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(16)
+      .a_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C16__NEON_MLAL_PADAL, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C16__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C16__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C16__NEON_MLAL_PADAL, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C16__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C16__NEON_MLAL_PADAL, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C16__NEON_MLAL_PADAL, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C16__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .a_stride(37)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C16__NEON_MLAL_PADAL, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C16__NEON_MLAL_PADAL, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C16__NEON_MLAL_PADAL, k_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .a_stride(163)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C16__NEON_MLAL_PADAL, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C16__NEON_MLAL_PADAL, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(1)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C16__NEON_MLAL_PADAL, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(1)
+          .n(16)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C16__NEON_MLAL_PADAL, n_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(83)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C16__NEON_MLAL_PADAL, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C16__NEON_MLAL_PADAL, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(1)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C16__NEON_MLAL_PADAL, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C16__NEON_MLAL_PADAL, n_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(83)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C16__NEON_MLAL_PADAL, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C16__NEON_MLAL_PADAL, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C16__NEON_MLAL_PADAL, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(16)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C16__NEON_MLAL_PADAL, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(16)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C16__NEON_MLAL_PADAL, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(16)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(16)
+      .cm_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_2X16C16__NEON_MLAL_PADAL, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(16)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C16__NEON_MLAL_PADAL, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(16)
+      .cn_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C16__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(16)
+      .a_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C16__NEON_MLAL_PADAL, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C16__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C16__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C16__NEON_MLAL_PADAL, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C16__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C16__NEON_MLAL_PADAL, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C16__NEON_MLAL_PADAL, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C16__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .a_stride(37)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C16__NEON_MLAL_PADAL, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C16__NEON_MLAL_PADAL, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C16__NEON_MLAL_PADAL, k_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .a_stride(163)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C16__NEON_MLAL_PADAL, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C16__NEON_MLAL_PADAL, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C16__NEON_MLAL_PADAL, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(16)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C16__NEON_MLAL_PADAL, n_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(83)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C16__NEON_MLAL_PADAL, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C16__NEON_MLAL_PADAL, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C16__NEON_MLAL_PADAL, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C16__NEON_MLAL_PADAL, n_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(83)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C16__NEON_MLAL_PADAL, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C16__NEON_MLAL_PADAL, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C16__NEON_MLAL_PADAL, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C16__NEON_MLAL_PADAL, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X16C16__NEON_MLAL_PADAL, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(16)
+      .cm_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_3X16C16__NEON_MLAL_PADAL, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(16)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(16)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C16__NEON_MLAL_PADAL, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(16)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(16)
+      .cn_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C16__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(16)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(16)
+      .a_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C16__NEON_MLAL_PADAL, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C16__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C16__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C16__NEON_MLAL_PADAL, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C16__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C16__NEON_MLAL_PADAL, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C16__NEON_MLAL_PADAL, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C16__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .a_stride(37)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C16__NEON_MLAL_PADAL, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C16__NEON_MLAL_PADAL, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C16__NEON_MLAL_PADAL, k_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .a_stride(163)
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C16__NEON_MLAL_PADAL, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C16__NEON_MLAL_PADAL, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(3)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C16__NEON_MLAL_PADAL, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(3)
+          .n(16)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C16__NEON_MLAL_PADAL, n_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(83)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C16__NEON_MLAL_PADAL, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C16__NEON_MLAL_PADAL, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(3)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C16__NEON_MLAL_PADAL, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C16__NEON_MLAL_PADAL, n_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(83)
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C16__NEON_MLAL_PADAL, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C16__NEON_MLAL_PADAL, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C16__NEON_MLAL_PADAL, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(16)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C16__NEON_MLAL_PADAL, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(16)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_3X16C16__NEON_MLAL_PADAL, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(16)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(16)
+      .cm_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_4X16C16__NEON_MLAL_PADAL, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(16)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C16__NEON_MLAL_PADAL, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(16)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .cn_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C16__NEON_MLAL_PADAL, k_eq_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(16)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .a_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C16__NEON_MLAL_PADAL, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C16__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C16__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C16__NEON_MLAL_PADAL, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C16__NEON_MLAL_PADAL, k_lt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C16__NEON_MLAL_PADAL, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C16__NEON_MLAL_PADAL, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C16__NEON_MLAL_PADAL, k_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(37)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C16__NEON_MLAL_PADAL, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C16__NEON_MLAL_PADAL, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C16__NEON_MLAL_PADAL, k_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(163)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C16__NEON_MLAL_PADAL, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C16__NEON_MLAL_PADAL, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C16__NEON_MLAL_PADAL, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C16__NEON_MLAL_PADAL, n_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(83)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C16__NEON_MLAL_PADAL, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C16__NEON_MLAL_PADAL, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C16__NEON_MLAL_PADAL, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C16__NEON_MLAL_PADAL, n_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(83)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C16__NEON_MLAL_PADAL, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C16__NEON_MLAL_PADAL, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C16__NEON_MLAL_PADAL, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(16)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C16__NEON_MLAL_PADAL, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(16)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C16__NEON_MLAL_PADAL, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(16)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .cm_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(QS8_GEMM_MINMAX_1X8C4__NEONDOT, k_eq_8) {
     TEST_REQUIRES_ARM_NEON_DOT;
     GemmMicrokernelTester()
@@ -3671,462 +28295,6 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_12X8C4__NEONDOT, k_eq_8) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(12)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(12)
-      .n(8)
-      .k(8)
-      .Test(xnn_qs8_gemm_minmax_ukernel_12x8c4__neondot);
-  }
-
-  TEST(QS8_GEMM_MINMAX_12X8C4__NEONDOT, strided_cn) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(12)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(12)
-      .n(8)
-      .k(8)
-      .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_12x8c4__neondot);
-  }
-
-  TEST(QS8_GEMM_MINMAX_12X8C4__NEONDOT, k_eq_8_strided_a) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(12)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(12)
-      .n(8)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_12x8c4__neondot);
-  }
-
-  TEST(QS8_GEMM_MINMAX_12X8C4__NEONDOT, k_eq_8_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t m = 1; m <= 12; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        GemmMicrokernelTester()
-          .mr(12)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_12x8c4__neondot);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_12X8C4__NEONDOT, k_eq_8_subtile_m) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t m = 1; m <= 12; m++) {
-      GemmMicrokernelTester()
-        .mr(12)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_12x8c4__neondot);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_12X8C4__NEONDOT, k_eq_8_subtile_n) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(12)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(12)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_12x8c4__neondot);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_12X8C4__NEONDOT, k_lt_8) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(12)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(12)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_12x8c4__neondot);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_12X8C4__NEONDOT, k_lt_8_strided_a) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(12)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(12)
-        .n(8)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_ukernel_12x8c4__neondot);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_12X8C4__NEONDOT, k_lt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 12; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(12)
-            .nr(8)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_12x8c4__neondot);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_12X8C4__NEONDOT, k_gt_8) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(12)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(12)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_12x8c4__neondot);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_12X8C4__NEONDOT, k_gt_8_strided_a) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(12)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(12)
-        .n(8)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_ukernel_12x8c4__neondot);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_12X8C4__NEONDOT, k_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 12; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(12)
-            .nr(8)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_12x8c4__neondot);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_12X8C4__NEONDOT, k_div_8) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(12)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(12)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_12x8c4__neondot);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_12X8C4__NEONDOT, k_div_8_strided_a) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(12)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(12)
-        .n(8)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_ukernel_12x8c4__neondot);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_12X8C4__NEONDOT, k_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 12; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(12)
-            .nr(8)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_12x8c4__neondot);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_12X8C4__NEONDOT, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(12)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(12)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_12x8c4__neondot);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_12X8C4__NEONDOT, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(12)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(12)
-          .n(8)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_ukernel_12x8c4__neondot);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_12X8C4__NEONDOT, n_gt_8_strided_a) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(12)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(12)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_12x8c4__neondot);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_12X8C4__NEONDOT, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 12; m++) {
-          GemmMicrokernelTester()
-            .mr(12)
-            .nr(8)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_12x8c4__neondot);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_12X8C4__NEONDOT, n_div_8) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(12)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(12)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_12x8c4__neondot);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_12X8C4__NEONDOT, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(12)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(12)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_ukernel_12x8c4__neondot);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_12X8C4__NEONDOT, n_div_8_strided_a) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(12)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(12)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_12x8c4__neondot);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_12X8C4__NEONDOT, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 12; m++) {
-          GemmMicrokernelTester()
-            .mr(12)
-            .nr(8)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_12x8c4__neondot);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_12X8C4__NEONDOT, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 12; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(12)
-            .nr(8)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_12x8c4__neondot);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_12X8C4__NEONDOT, qmin) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(12)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(12)
-      .n(8)
-      .k(8)
-      .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_12x8c4__neondot);
-  }
-
-  TEST(QS8_GEMM_MINMAX_12X8C4__NEONDOT, qmax) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(12)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(12)
-      .n(8)
-      .k(8)
-      .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_12x8c4__neondot);
-  }
-
-  TEST(QS8_GEMM_MINMAX_12X8C4__NEONDOT, strided_cm) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(12)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(12)
-      .n(8)
-      .k(8)
-      .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_12x8c4__neondot);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(QS8_GEMM_MINMAX_1X16C4__NEONDOT, k_eq_8) {
     TEST_REQUIRES_ARM_NEON_DOT;
     GemmMicrokernelTester()
@@ -5951,6 +30119,918 @@
 
 
 #if XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C16__AARCH64_NEON_MLAL_PADAL, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .cn_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .a_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C16__AARCH64_NEON_MLAL_PADAL, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C16__AARCH64_NEON_MLAL_PADAL, k_lt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C16__AARCH64_NEON_MLAL_PADAL, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C16__AARCH64_NEON_MLAL_PADAL, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C16__AARCH64_NEON_MLAL_PADAL, k_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .a_stride(37)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C16__AARCH64_NEON_MLAL_PADAL, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C16__AARCH64_NEON_MLAL_PADAL, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C16__AARCH64_NEON_MLAL_PADAL, k_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .a_stride(163)
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C16__AARCH64_NEON_MLAL_PADAL, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(83)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(83)
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C16__AARCH64_NEON_MLAL_PADAL, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C16__AARCH64_NEON_MLAL_PADAL, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C16__AARCH64_NEON_MLAL_PADAL, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal);
+  }
+
+  TEST(QS8_GEMM_MINMAX_2X8C16__AARCH64_NEON_MLAL_PADAL, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .cm_stride(11)
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal);
+  }
+#endif  // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_1X16C4__AARCH64_NEONDOT_LD32, k_eq_4) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(4)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld32);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C4__AARCH64_NEONDOT_LD32, strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(4)
+      .cn_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld32);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C4__AARCH64_NEONDOT_LD32, k_eq_4_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(4)
+      .a_stride(7)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld32);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C4__AARCH64_NEONDOT_LD32, k_eq_4_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(4)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C4__AARCH64_NEONDOT_LD32, k_eq_4_subtile_m) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(4)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C4__AARCH64_NEONDOT_LD32, k_eq_4_subtile_n) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(4)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C4__AARCH64_NEONDOT_LD32, k_lt_4) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k < 4; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C4__AARCH64_NEONDOT_LD32, k_lt_4_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k < 4; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .a_stride(7)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C4__AARCH64_NEONDOT_LD32, k_lt_4_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k < 4; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C4__AARCH64_NEONDOT_LD32, k_gt_4) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 5; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C4__AARCH64_NEONDOT_LD32, k_gt_4_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 5; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C4__AARCH64_NEONDOT_LD32, k_gt_4_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 5; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C4__AARCH64_NEONDOT_LD32, k_div_4) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 8; k <= 40; k += 4) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C4__AARCH64_NEONDOT_LD32, k_div_4_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 8; k <= 40; k += 4) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .a_stride(43)
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C4__AARCH64_NEONDOT_LD32, k_div_4_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 8; k <= 40; k += 4) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C4__AARCH64_NEONDOT_LD32, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 20; k += 5) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(1)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C4__AARCH64_NEONDOT_LD32, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 20; k += 5) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(1)
+          .n(16)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C4__AARCH64_NEONDOT_LD32, n_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 20; k += 5) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(23)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C4__AARCH64_NEONDOT_LD32, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 20; k += 5) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C4__AARCH64_NEONDOT_LD32, n_div_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 20; k += 5) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(1)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C4__AARCH64_NEONDOT_LD32, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 20; k += 5) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C4__AARCH64_NEONDOT_LD32, n_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 20; k += 5) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(23)
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C4__AARCH64_NEONDOT_LD32, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 20; k += 5) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C4__AARCH64_NEONDOT_LD32, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k <= 20; k += 5) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C4__AARCH64_NEONDOT_LD32, qmin) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(4)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld32);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C4__AARCH64_NEONDOT_LD32, qmax) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(4)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld32);
+  }
+
+  TEST(QS8_GEMM_MINMAX_1X16C4__AARCH64_NEONDOT_LD32, strided_cm) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(4)
+      .cm_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld32);
+  }
+#endif  // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
   TEST(QS8_GEMM_MINMAX_1X16C4__AARCH64_NEONDOT_LD64, k_eq_8) {
     TEST_REQUIRES_ARM_NEON_DOT;
     GemmMicrokernelTester()
@@ -6407,6 +31487,462 @@
 
 
 #if XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_4X16C4__AARCH64_NEONDOT_LD32, k_eq_4) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(4)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld32);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C4__AARCH64_NEONDOT_LD32, strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(4)
+      .cn_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld32);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C4__AARCH64_NEONDOT_LD32, k_eq_4_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(4)
+      .a_stride(7)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld32);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C4__AARCH64_NEONDOT_LD32, k_eq_4_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(4)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C4__AARCH64_NEONDOT_LD32, k_eq_4_subtile_m) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(4)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C4__AARCH64_NEONDOT_LD32, k_eq_4_subtile_n) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(4)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C4__AARCH64_NEONDOT_LD32, k_lt_4) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k < 4; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C4__AARCH64_NEONDOT_LD32, k_lt_4_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k < 4; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(7)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C4__AARCH64_NEONDOT_LD32, k_lt_4_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k < 4; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C4__AARCH64_NEONDOT_LD32, k_gt_4) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 5; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C4__AARCH64_NEONDOT_LD32, k_gt_4_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 5; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C4__AARCH64_NEONDOT_LD32, k_gt_4_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 5; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C4__AARCH64_NEONDOT_LD32, k_div_4) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 8; k <= 40; k += 4) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C4__AARCH64_NEONDOT_LD32, k_div_4_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 8; k <= 40; k += 4) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(43)
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C4__AARCH64_NEONDOT_LD32, k_div_4_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 8; k <= 40; k += 4) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C4__AARCH64_NEONDOT_LD32, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 20; k += 5) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C4__AARCH64_NEONDOT_LD32, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 20; k += 5) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C4__AARCH64_NEONDOT_LD32, n_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 20; k += 5) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(23)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C4__AARCH64_NEONDOT_LD32, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 20; k += 5) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C4__AARCH64_NEONDOT_LD32, n_div_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 20; k += 5) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C4__AARCH64_NEONDOT_LD32, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 20; k += 5) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C4__AARCH64_NEONDOT_LD32, n_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 20; k += 5) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(23)
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C4__AARCH64_NEONDOT_LD32, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 20; k += 5) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C4__AARCH64_NEONDOT_LD32, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k <= 20; k += 5) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C4__AARCH64_NEONDOT_LD32, qmin) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(4)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld32);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C4__AARCH64_NEONDOT_LD32, qmax) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(4)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld32);
+  }
+
+  TEST(QS8_GEMM_MINMAX_4X16C4__AARCH64_NEONDOT_LD32, strided_cm) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(4)
+      .cm_stride(19)
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld32);
+  }
+#endif  // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
   TEST(QS8_GEMM_MINMAX_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8) {
     TEST_REQUIRES_ARM_NEON_DOT;
     GemmMicrokernelTester()
@@ -42375,857 +67911,3 @@
       .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__wasmsimd);
   }
 #endif  // XNN_ARCH_WASMSIMD
-
-
-TEST(QS8_GEMM_MINMAX_8X8C4__SCALAR, k_eq_4) {
-  GemmMicrokernelTester()
-    .mr(8)
-    .nr(8)
-    .kr(4)
-    .sr(1)
-    .m(8)
-    .n(8)
-    .k(4)
-    .Test(xnn_qs8_gemm_minmax_ukernel_8x8c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-}
-
-TEST(QS8_GEMM_MINMAX_8X8C4__SCALAR, strided_cn) {
-  GemmMicrokernelTester()
-    .mr(8)
-    .nr(8)
-    .kr(4)
-    .sr(1)
-    .m(8)
-    .n(8)
-    .k(4)
-    .cn_stride(11)
-    .Test(xnn_qs8_gemm_minmax_ukernel_8x8c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-}
-
-TEST(QS8_GEMM_MINMAX_8X8C4__SCALAR, k_eq_4_strided_a) {
-  GemmMicrokernelTester()
-    .mr(8)
-    .nr(8)
-    .kr(4)
-    .sr(1)
-    .m(8)
-    .n(8)
-    .k(4)
-    .a_stride(7)
-    .Test(xnn_qs8_gemm_minmax_ukernel_8x8c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-}
-
-TEST(QS8_GEMM_MINMAX_8X8C4__SCALAR, k_eq_4_subtile) {
-  for (uint32_t m = 1; m <= 8; m++) {
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(8)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(m)
-        .n(n)
-        .k(4)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_8x8c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-    }
-  }
-}
-
-TEST(QS8_GEMM_MINMAX_8X8C4__SCALAR, k_eq_4_subtile_m) {
-  for (uint32_t m = 1; m <= 8; m++) {
-    GemmMicrokernelTester()
-      .mr(8)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(m)
-      .n(8)
-      .k(4)
-      .iterations(1)
-      .Test(xnn_qs8_gemm_minmax_ukernel_8x8c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-  }
-}
-
-TEST(QS8_GEMM_MINMAX_8X8C4__SCALAR, k_eq_4_subtile_n) {
-  for (uint32_t n = 1; n <= 8; n++) {
-    GemmMicrokernelTester()
-      .mr(8)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(8)
-      .n(n)
-      .k(4)
-      .iterations(1)
-      .Test(xnn_qs8_gemm_minmax_ukernel_8x8c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-  }
-}
-
-TEST(QS8_GEMM_MINMAX_8X8C4__SCALAR, k_lt_4) {
-  for (size_t k = 1; k < 4; k++) {
-    GemmMicrokernelTester()
-      .mr(8)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(8)
-      .n(8)
-      .k(k)
-      .Test(xnn_qs8_gemm_minmax_ukernel_8x8c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-  }
-}
-
-TEST(QS8_GEMM_MINMAX_8X8C4__SCALAR, k_lt_4_strided_a) {
-  for (size_t k = 1; k < 4; k++) {
-    GemmMicrokernelTester()
-      .mr(8)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(8)
-      .n(8)
-      .k(k)
-      .a_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_8x8c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-  }
-}
-
-TEST(QS8_GEMM_MINMAX_8X8C4__SCALAR, k_lt_4_subtile) {
-  for (size_t k = 1; k < 4; k++) {
-    for (uint32_t m = 1; m <= 8; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        GemmMicrokernelTester()
-          .mr(8)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(k)
-          .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_8x8c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-}
-
-TEST(QS8_GEMM_MINMAX_8X8C4__SCALAR, k_gt_4) {
-  for (size_t k = 5; k < 8; k++) {
-    GemmMicrokernelTester()
-      .mr(8)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(8)
-      .n(8)
-      .k(k)
-      .Test(xnn_qs8_gemm_minmax_ukernel_8x8c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-  }
-}
-
-TEST(QS8_GEMM_MINMAX_8X8C4__SCALAR, k_gt_4_strided_a) {
-  for (size_t k = 5; k < 8; k++) {
-    GemmMicrokernelTester()
-      .mr(8)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(8)
-      .n(8)
-      .k(k)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_8x8c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-  }
-}
-
-TEST(QS8_GEMM_MINMAX_8X8C4__SCALAR, k_gt_4_subtile) {
-  for (size_t k = 5; k < 8; k++) {
-    for (uint32_t m = 1; m <= 8; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        GemmMicrokernelTester()
-          .mr(8)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(k)
-          .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_8x8c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-}
-
-TEST(QS8_GEMM_MINMAX_8X8C4__SCALAR, k_div_4) {
-  for (size_t k = 8; k <= 40; k += 4) {
-    GemmMicrokernelTester()
-      .mr(8)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(8)
-      .n(8)
-      .k(k)
-      .Test(xnn_qs8_gemm_minmax_ukernel_8x8c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-  }
-}
-
-TEST(QS8_GEMM_MINMAX_8X8C4__SCALAR, k_div_4_strided_a) {
-  for (size_t k = 8; k <= 40; k += 4) {
-    GemmMicrokernelTester()
-      .mr(8)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(8)
-      .n(8)
-      .k(k)
-      .a_stride(43)
-      .Test(xnn_qs8_gemm_minmax_ukernel_8x8c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-  }
-}
-
-TEST(QS8_GEMM_MINMAX_8X8C4__SCALAR, k_div_4_subtile) {
-  for (size_t k = 8; k <= 40; k += 4) {
-    for (uint32_t m = 1; m <= 8; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        GemmMicrokernelTester()
-          .mr(8)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(k)
-          .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_8x8c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-}
-
-TEST(QS8_GEMM_MINMAX_8X8C4__SCALAR, n_gt_8) {
-  for (uint32_t n = 9; n < 16; n++) {
-    for (size_t k = 1; k <= 20; k += 5) {
-      GemmMicrokernelTester()
-        .mr(8)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(8)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_8x8c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-    }
-  }
-}
-
-TEST(QS8_GEMM_MINMAX_8X8C4__SCALAR, n_gt_8_strided_cn) {
-  for (uint32_t n = 9; n < 16; n++) {
-    for (size_t k = 1; k <= 20; k += 5) {
-      GemmMicrokernelTester()
-        .mr(8)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(8)
-        .n(8)
-        .k(k)
-        .cn_stride(11)
-        .Test(xnn_qs8_gemm_minmax_ukernel_8x8c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-    }
-  }
-}
-
-TEST(QS8_GEMM_MINMAX_8X8C4__SCALAR, n_gt_8_strided_a) {
-  for (uint32_t n = 9; n < 16; n++) {
-    for (size_t k = 1; k <= 20; k += 5) {
-      GemmMicrokernelTester()
-        .mr(8)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(8)
-        .n(n)
-        .k(k)
-        .a_stride(23)
-        .Test(xnn_qs8_gemm_minmax_ukernel_8x8c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-    }
-  }
-}
-
-TEST(QS8_GEMM_MINMAX_8X8C4__SCALAR, n_gt_8_subtile) {
-  for (uint32_t n = 9; n < 16; n++) {
-    for (size_t k = 1; k <= 20; k += 5) {
-      for (uint32_t m = 1; m <= 8; m++) {
-        GemmMicrokernelTester()
-          .mr(8)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(k)
-          .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_8x8c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-}
-
-TEST(QS8_GEMM_MINMAX_8X8C4__SCALAR, n_div_8) {
-  for (uint32_t n = 16; n <= 24; n += 8) {
-    for (size_t k = 1; k <= 20; k += 5) {
-      GemmMicrokernelTester()
-        .mr(8)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(8)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_8x8c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-    }
-  }
-}
-
-TEST(QS8_GEMM_MINMAX_8X8C4__SCALAR, n_div_8_strided_cn) {
-  for (uint32_t n = 16; n <= 24; n += 8) {
-    for (size_t k = 1; k <= 20; k += 5) {
-      GemmMicrokernelTester()
-        .mr(8)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(8)
-        .n(n)
-        .k(k)
-        .cn_stride(11)
-        .Test(xnn_qs8_gemm_minmax_ukernel_8x8c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-    }
-  }
-}
-
-TEST(QS8_GEMM_MINMAX_8X8C4__SCALAR, n_div_8_strided_a) {
-  for (uint32_t n = 16; n <= 24; n += 8) {
-    for (size_t k = 1; k <= 20; k += 5) {
-      GemmMicrokernelTester()
-        .mr(8)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(8)
-        .n(n)
-        .k(k)
-        .a_stride(23)
-        .Test(xnn_qs8_gemm_minmax_ukernel_8x8c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-    }
-  }
-}
-
-TEST(QS8_GEMM_MINMAX_8X8C4__SCALAR, n_div_8_subtile) {
-  for (uint32_t n = 16; n <= 24; n += 8) {
-    for (size_t k = 1; k <= 20; k += 5) {
-      for (uint32_t m = 1; m <= 8; m++) {
-        GemmMicrokernelTester()
-          .mr(8)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(k)
-          .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_8x8c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-}
-
-TEST(QS8_GEMM_MINMAX_8X8C4__SCALAR, strided_cm_subtile) {
-  for (size_t k = 1; k <= 20; k += 5) {
-    for (uint32_t m = 1; m <= 8; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        GemmMicrokernelTester()
-          .mr(8)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(k)
-          .cm_stride(11)
-          .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_8x8c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-}
-
-TEST(QS8_GEMM_MINMAX_8X8C4__SCALAR, qmin) {
-  GemmMicrokernelTester()
-    .mr(8)
-    .nr(8)
-    .kr(4)
-    .sr(1)
-    .m(8)
-    .n(8)
-    .k(4)
-    .qmin(128)
-    .Test(xnn_qs8_gemm_minmax_ukernel_8x8c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-}
-
-TEST(QS8_GEMM_MINMAX_8X8C4__SCALAR, qmax) {
-  GemmMicrokernelTester()
-    .mr(8)
-    .nr(8)
-    .kr(4)
-    .sr(1)
-    .m(8)
-    .n(8)
-    .k(4)
-    .qmax(128)
-    .Test(xnn_qs8_gemm_minmax_ukernel_8x8c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-}
-
-TEST(QS8_GEMM_MINMAX_8X8C4__SCALAR, strided_cm) {
-  GemmMicrokernelTester()
-    .mr(8)
-    .nr(8)
-    .kr(4)
-    .sr(1)
-    .m(8)
-    .n(8)
-    .k(4)
-    .cm_stride(11)
-    .Test(xnn_qs8_gemm_minmax_ukernel_8x8c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-}
-
-
-TEST(QS8_GEMM_MINMAX_12X4C4__SCALAR, k_eq_4) {
-  GemmMicrokernelTester()
-    .mr(12)
-    .nr(4)
-    .kr(4)
-    .sr(1)
-    .m(12)
-    .n(4)
-    .k(4)
-    .Test(xnn_qs8_gemm_minmax_ukernel_12x4c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-}
-
-TEST(QS8_GEMM_MINMAX_12X4C4__SCALAR, strided_cn) {
-  GemmMicrokernelTester()
-    .mr(12)
-    .nr(4)
-    .kr(4)
-    .sr(1)
-    .m(12)
-    .n(4)
-    .k(4)
-    .cn_stride(7)
-    .Test(xnn_qs8_gemm_minmax_ukernel_12x4c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-}
-
-TEST(QS8_GEMM_MINMAX_12X4C4__SCALAR, k_eq_4_strided_a) {
-  GemmMicrokernelTester()
-    .mr(12)
-    .nr(4)
-    .kr(4)
-    .sr(1)
-    .m(12)
-    .n(4)
-    .k(4)
-    .a_stride(7)
-    .Test(xnn_qs8_gemm_minmax_ukernel_12x4c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-}
-
-TEST(QS8_GEMM_MINMAX_12X4C4__SCALAR, k_eq_4_subtile) {
-  for (uint32_t m = 1; m <= 12; m++) {
-    for (uint32_t n = 1; n <= 4; n++) {
-      GemmMicrokernelTester()
-        .mr(12)
-        .nr(4)
-        .kr(4)
-        .sr(1)
-        .m(m)
-        .n(n)
-        .k(4)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_12x4c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-    }
-  }
-}
-
-TEST(QS8_GEMM_MINMAX_12X4C4__SCALAR, k_eq_4_subtile_m) {
-  for (uint32_t m = 1; m <= 12; m++) {
-    GemmMicrokernelTester()
-      .mr(12)
-      .nr(4)
-      .kr(4)
-      .sr(1)
-      .m(m)
-      .n(4)
-      .k(4)
-      .iterations(1)
-      .Test(xnn_qs8_gemm_minmax_ukernel_12x4c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-  }
-}
-
-TEST(QS8_GEMM_MINMAX_12X4C4__SCALAR, k_eq_4_subtile_n) {
-  for (uint32_t n = 1; n <= 4; n++) {
-    GemmMicrokernelTester()
-      .mr(12)
-      .nr(4)
-      .kr(4)
-      .sr(1)
-      .m(12)
-      .n(n)
-      .k(4)
-      .iterations(1)
-      .Test(xnn_qs8_gemm_minmax_ukernel_12x4c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-  }
-}
-
-TEST(QS8_GEMM_MINMAX_12X4C4__SCALAR, k_lt_4) {
-  for (size_t k = 1; k < 4; k++) {
-    GemmMicrokernelTester()
-      .mr(12)
-      .nr(4)
-      .kr(4)
-      .sr(1)
-      .m(12)
-      .n(4)
-      .k(k)
-      .Test(xnn_qs8_gemm_minmax_ukernel_12x4c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-  }
-}
-
-TEST(QS8_GEMM_MINMAX_12X4C4__SCALAR, k_lt_4_strided_a) {
-  for (size_t k = 1; k < 4; k++) {
-    GemmMicrokernelTester()
-      .mr(12)
-      .nr(4)
-      .kr(4)
-      .sr(1)
-      .m(12)
-      .n(4)
-      .k(k)
-      .a_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_12x4c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-  }
-}
-
-TEST(QS8_GEMM_MINMAX_12X4C4__SCALAR, k_lt_4_subtile) {
-  for (size_t k = 1; k < 4; k++) {
-    for (uint32_t m = 1; m <= 12; m++) {
-      for (uint32_t n = 1; n <= 4; n++) {
-        GemmMicrokernelTester()
-          .mr(12)
-          .nr(4)
-          .kr(4)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(k)
-          .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_12x4c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-}
-
-TEST(QS8_GEMM_MINMAX_12X4C4__SCALAR, k_gt_4) {
-  for (size_t k = 5; k < 8; k++) {
-    GemmMicrokernelTester()
-      .mr(12)
-      .nr(4)
-      .kr(4)
-      .sr(1)
-      .m(12)
-      .n(4)
-      .k(k)
-      .Test(xnn_qs8_gemm_minmax_ukernel_12x4c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-  }
-}
-
-TEST(QS8_GEMM_MINMAX_12X4C4__SCALAR, k_gt_4_strided_a) {
-  for (size_t k = 5; k < 8; k++) {
-    GemmMicrokernelTester()
-      .mr(12)
-      .nr(4)
-      .kr(4)
-      .sr(1)
-      .m(12)
-      .n(4)
-      .k(k)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_12x4c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-  }
-}
-
-TEST(QS8_GEMM_MINMAX_12X4C4__SCALAR, k_gt_4_subtile) {
-  for (size_t k = 5; k < 8; k++) {
-    for (uint32_t m = 1; m <= 12; m++) {
-      for (uint32_t n = 1; n <= 4; n++) {
-        GemmMicrokernelTester()
-          .mr(12)
-          .nr(4)
-          .kr(4)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(k)
-          .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_12x4c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-}
-
-TEST(QS8_GEMM_MINMAX_12X4C4__SCALAR, k_div_4) {
-  for (size_t k = 8; k <= 40; k += 4) {
-    GemmMicrokernelTester()
-      .mr(12)
-      .nr(4)
-      .kr(4)
-      .sr(1)
-      .m(12)
-      .n(4)
-      .k(k)
-      .Test(xnn_qs8_gemm_minmax_ukernel_12x4c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-  }
-}
-
-TEST(QS8_GEMM_MINMAX_12X4C4__SCALAR, k_div_4_strided_a) {
-  for (size_t k = 8; k <= 40; k += 4) {
-    GemmMicrokernelTester()
-      .mr(12)
-      .nr(4)
-      .kr(4)
-      .sr(1)
-      .m(12)
-      .n(4)
-      .k(k)
-      .a_stride(43)
-      .Test(xnn_qs8_gemm_minmax_ukernel_12x4c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-  }
-}
-
-TEST(QS8_GEMM_MINMAX_12X4C4__SCALAR, k_div_4_subtile) {
-  for (size_t k = 8; k <= 40; k += 4) {
-    for (uint32_t m = 1; m <= 12; m++) {
-      for (uint32_t n = 1; n <= 4; n++) {
-        GemmMicrokernelTester()
-          .mr(12)
-          .nr(4)
-          .kr(4)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(k)
-          .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_12x4c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-}
-
-TEST(QS8_GEMM_MINMAX_12X4C4__SCALAR, n_gt_4) {
-  for (uint32_t n = 5; n < 8; n++) {
-    for (size_t k = 1; k <= 20; k += 5) {
-      GemmMicrokernelTester()
-        .mr(12)
-        .nr(4)
-        .kr(4)
-        .sr(1)
-        .m(12)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_12x4c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-    }
-  }
-}
-
-TEST(QS8_GEMM_MINMAX_12X4C4__SCALAR, n_gt_4_strided_cn) {
-  for (uint32_t n = 5; n < 8; n++) {
-    for (size_t k = 1; k <= 20; k += 5) {
-      GemmMicrokernelTester()
-        .mr(12)
-        .nr(4)
-        .kr(4)
-        .sr(1)
-        .m(12)
-        .n(4)
-        .k(k)
-        .cn_stride(7)
-        .Test(xnn_qs8_gemm_minmax_ukernel_12x4c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-    }
-  }
-}
-
-TEST(QS8_GEMM_MINMAX_12X4C4__SCALAR, n_gt_4_strided_a) {
-  for (uint32_t n = 5; n < 8; n++) {
-    for (size_t k = 1; k <= 20; k += 5) {
-      GemmMicrokernelTester()
-        .mr(12)
-        .nr(4)
-        .kr(4)
-        .sr(1)
-        .m(12)
-        .n(n)
-        .k(k)
-        .a_stride(23)
-        .Test(xnn_qs8_gemm_minmax_ukernel_12x4c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-    }
-  }
-}
-
-TEST(QS8_GEMM_MINMAX_12X4C4__SCALAR, n_gt_4_subtile) {
-  for (uint32_t n = 5; n < 8; n++) {
-    for (size_t k = 1; k <= 20; k += 5) {
-      for (uint32_t m = 1; m <= 12; m++) {
-        GemmMicrokernelTester()
-          .mr(12)
-          .nr(4)
-          .kr(4)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(k)
-          .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_12x4c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-}
-
-TEST(QS8_GEMM_MINMAX_12X4C4__SCALAR, n_div_4) {
-  for (uint32_t n = 8; n <= 12; n += 4) {
-    for (size_t k = 1; k <= 20; k += 5) {
-      GemmMicrokernelTester()
-        .mr(12)
-        .nr(4)
-        .kr(4)
-        .sr(1)
-        .m(12)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_12x4c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-    }
-  }
-}
-
-TEST(QS8_GEMM_MINMAX_12X4C4__SCALAR, n_div_4_strided_cn) {
-  for (uint32_t n = 8; n <= 12; n += 4) {
-    for (size_t k = 1; k <= 20; k += 5) {
-      GemmMicrokernelTester()
-        .mr(12)
-        .nr(4)
-        .kr(4)
-        .sr(1)
-        .m(12)
-        .n(n)
-        .k(k)
-        .cn_stride(7)
-        .Test(xnn_qs8_gemm_minmax_ukernel_12x4c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-    }
-  }
-}
-
-TEST(QS8_GEMM_MINMAX_12X4C4__SCALAR, n_div_4_strided_a) {
-  for (uint32_t n = 8; n <= 12; n += 4) {
-    for (size_t k = 1; k <= 20; k += 5) {
-      GemmMicrokernelTester()
-        .mr(12)
-        .nr(4)
-        .kr(4)
-        .sr(1)
-        .m(12)
-        .n(n)
-        .k(k)
-        .a_stride(23)
-        .Test(xnn_qs8_gemm_minmax_ukernel_12x4c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-    }
-  }
-}
-
-TEST(QS8_GEMM_MINMAX_12X4C4__SCALAR, n_div_4_subtile) {
-  for (uint32_t n = 8; n <= 12; n += 4) {
-    for (size_t k = 1; k <= 20; k += 5) {
-      for (uint32_t m = 1; m <= 12; m++) {
-        GemmMicrokernelTester()
-          .mr(12)
-          .nr(4)
-          .kr(4)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(k)
-          .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_12x4c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-}
-
-TEST(QS8_GEMM_MINMAX_12X4C4__SCALAR, strided_cm_subtile) {
-  for (size_t k = 1; k <= 20; k += 5) {
-    for (uint32_t m = 1; m <= 12; m++) {
-      for (uint32_t n = 1; n <= 4; n++) {
-        GemmMicrokernelTester()
-          .mr(12)
-          .nr(4)
-          .kr(4)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(k)
-          .cm_stride(7)
-          .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_12x4c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-}
-
-TEST(QS8_GEMM_MINMAX_12X4C4__SCALAR, qmin) {
-  GemmMicrokernelTester()
-    .mr(12)
-    .nr(4)
-    .kr(4)
-    .sr(1)
-    .m(12)
-    .n(4)
-    .k(4)
-    .qmin(128)
-    .Test(xnn_qs8_gemm_minmax_ukernel_12x4c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-}
-
-TEST(QS8_GEMM_MINMAX_12X4C4__SCALAR, qmax) {
-  GemmMicrokernelTester()
-    .mr(12)
-    .nr(4)
-    .kr(4)
-    .sr(1)
-    .m(12)
-    .n(4)
-    .k(4)
-    .qmax(128)
-    .Test(xnn_qs8_gemm_minmax_ukernel_12x4c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-}
-
-TEST(QS8_GEMM_MINMAX_12X4C4__SCALAR, strided_cm) {
-  GemmMicrokernelTester()
-    .mr(12)
-    .nr(4)
-    .kr(4)
-    .sr(1)
-    .m(12)
-    .n(4)
-    .k(4)
-    .cm_stride(7)
-    .Test(xnn_qs8_gemm_minmax_ukernel_12x4c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-}
diff --git a/test/qs8-gemm-minmax.yaml b/test/qs8-gemm-minmax.yaml
index d2bf977..e85ca93 100644
--- a/test/qs8-gemm-minmax.yaml
+++ b/test/qs8-gemm-minmax.yaml
@@ -2,14 +2,122 @@
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
+- name: xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal
+  k-block: 16
+- name: xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mull_padal
+  k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_1x8__neon_mlal_lane
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_2x8__neon_mlal_lane
   k-block: 8
+- name: xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane
+  k-block: 8
+- name: xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane
+  k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane
   k-block: 8
+- name: xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane
+  k-block: 8
+- name: xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane
+  k-block: 8
+- name: xnn_qs8_gemm_minmax_ukernel_1x8__neon_mull_addw_dup
+  k-block: 8
+- name: xnn_qs8_gemm_minmax_ukernel_2x8__neon_mull_addw_dup
+  k-block: 8
+- name: xnn_qs8_gemm_minmax_ukernel_3x8__neon_mull_addw_dup
+  k-block: 8
+- name: xnn_qs8_gemm_minmax_ukernel_4x8__neon_mull_addw_dup
+  k-block: 8
+- name: xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup
+  k-block: 8
+- name: xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup
+  k-block: 8
+- name: xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup
+  k-block: 8
+- name: xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup
+  k-block: 8
+- name: xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mull_padal_dup
+  k-block: 8
+- name: xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mull_padal_dup
+  k-block: 8
+- name: xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mull_padal_dup
+  k-block: 8
+- name: xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mull_padal_dup
+  k-block: 8
+- name: xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup
+  k-block: 8
+- name: xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup
+  k-block: 8
+- name: xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup
+  k-block: 8
+- name: xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup
+  k-block: 8
+- name: xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup
+  k-block: 16
+- name: xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup
+  k-block: 16
+- name: xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup
+  k-block: 16
+- name: xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup
+  k-block: 16
+- name: xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup
+  k-block: 16
+- name: xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup
+  k-block: 16
+- name: xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup
+  k-block: 16
+- name: xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup
+  k-block: 16
+- name: xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mull_padal
+  k-block: 8
+- name: xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mull_padal
+  k-block: 8
+- name: xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal
+  k-block: 8
+- name: xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal
+  k-block: 8
+- name: xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal
+  k-block: 8
+- name: xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal
+  k-block: 8
+- name: xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal
+  k-block: 8
+- name: xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal
+  k-block: 8
+- name: xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal
+  k-block: 16
+- name: xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal
+  k-block: 16
+- name: xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal
+  k-block: 16
+- name: xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal
+  k-block: 16
+- name: xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal
+  k-block: 16
+- name: xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal
+  k-block: 16
+- name: xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal
+  k-block: 16
+- name: xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal
+  k-block: 16
+- name: xnn_qs8_gemm_minmax_ukernel_1x8c16__neon_mlal_padal
+  k-block: 16
+- name: xnn_qs8_gemm_minmax_ukernel_2x8c16__neon_mlal_padal
+  k-block: 16
+- name: xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal
+  k-block: 16
+- name: xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal
+  k-block: 16
+- name: xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal
+  k-block: 16
+- name: xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal
+  k-block: 16
+- name: xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal
+  k-block: 16
+- name: xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal
+  k-block: 16
 - name: xnn_qs8_gemm_minmax_ukernel_1x8c4__neondot
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_4x8c4__neondot
@@ -18,8 +126,6 @@
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_8x8c4__neondot
   k-block: 8
-- name: xnn_qs8_gemm_minmax_ukernel_12x8c4__neondot
-  k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot
@@ -28,8 +134,14 @@
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot
   k-block: 8
+- name: xnn_qs8_gemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal
+  k-block: 16
+- name: xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld32
+  k-block: 4
 - name: xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld64
   k-block: 8
+- name: xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld32
+  k-block: 4
 - name: xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55
@@ -192,7 +304,3 @@
   k-block: 8
 - name: xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__wasmsimd
   k-block: 8
-- name: xnn_qs8_gemm_minmax_ukernel_8x8c4__scalar
-  k-block: 4
-- name: xnn_qs8_gemm_minmax_ukernel_12x4c4__scalar
-  k-block: 4
diff --git a/test/qs8-igemm-minmax.cc b/test/qs8-igemm-minmax.cc
index afaf7c2..613389f 100644
--- a/test/qs8-igemm-minmax.cc
+++ b/test/qs8-igemm-minmax.cc
@@ -22,6 +22,942 @@
 #include "gemm-microkernel-tester.h"
 
 
+#if XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_2X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__AARCH64_NEON_MLAL_PADAL, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .cn_stride(11)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__AARCH64_NEON_MLAL_PADAL, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__AARCH64_NEON_MLAL_PADAL, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__AARCH64_NEON_MLAL_PADAL, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__AARCH64_NEON_MLAL_PADAL, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .a_offset(163)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__AARCH64_NEON_MLAL_PADAL, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 2; mz++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .a_offset(163)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__AARCH64_NEON_MLAL_PADAL, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__AARCH64_NEON_MLAL_PADAL, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__AARCH64_NEON_MLAL_PADAL, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .cm_stride(11)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
+  }
+#endif  // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C16__AARCH64_NEON_MLAL_PADAL, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .cn_stride(11)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C16__AARCH64_NEON_MLAL_PADAL, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C16__AARCH64_NEON_MLAL_PADAL, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C16__AARCH64_NEON_MLAL_PADAL, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C16__AARCH64_NEON_MLAL_PADAL, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C16__AARCH64_NEON_MLAL_PADAL, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C16__AARCH64_NEON_MLAL_PADAL, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C16__AARCH64_NEON_MLAL_PADAL, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C16__AARCH64_NEON_MLAL_PADAL, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C16__AARCH64_NEON_MLAL_PADAL, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C16__AARCH64_NEON_MLAL_PADAL, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .a_offset(163)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C16__AARCH64_NEON_MLAL_PADAL, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 2; mz++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .a_offset(163)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C16__AARCH64_NEON_MLAL_PADAL, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C16__AARCH64_NEON_MLAL_PADAL, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C16__AARCH64_NEON_MLAL_PADAL, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .cm_stride(11)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal);
+  }
+#endif  // XNN_ARCH_ARM64
+
+
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(QS8_IGEMM_MINMAX_1X8__NEON_MLAL_LANE, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
@@ -959,6 +1895,942 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_4X8__NEON_MLAL_LANE, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mlal_lane);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8__NEON_MLAL_LANE, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .cn_stride(11)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mlal_lane);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8__NEON_MLAL_LANE, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8__NEON_MLAL_LANE, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8__NEON_MLAL_LANE, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8__NEON_MLAL_LANE, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8__NEON_MLAL_LANE, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mlal_lane);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8__NEON_MLAL_LANE, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8__NEON_MLAL_LANE, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mlal_lane);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8__NEON_MLAL_LANE, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8__NEON_MLAL_LANE, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mlal_lane);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8__NEON_MLAL_LANE, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8__NEON_MLAL_LANE, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8__NEON_MLAL_LANE, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mlal_lane);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8__NEON_MLAL_LANE, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8__NEON_MLAL_LANE, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8__NEON_MLAL_LANE, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mlal_lane);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8__NEON_MLAL_LANE, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8__NEON_MLAL_LANE, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mlal_lane);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8__NEON_MLAL_LANE, n_gt_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8__NEON_MLAL_LANE, n_div_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8__NEON_MLAL_LANE, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mlal_lane);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8__NEON_MLAL_LANE, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .a_offset(163)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8__NEON_MLAL_LANE, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 4; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .a_offset(163)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8__NEON_MLAL_LANE, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mlal_lane);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8__NEON_MLAL_LANE, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mlal_lane);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8__NEON_MLAL_LANE, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .cm_stride(11)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mlal_lane);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_3X8__NEON_MLAL_LANE, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(8)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mlal_lane);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8__NEON_MLAL_LANE, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(8)
+      .cn_stride(11)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mlal_lane);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8__NEON_MLAL_LANE, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8__NEON_MLAL_LANE, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8__NEON_MLAL_LANE, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8__NEON_MLAL_LANE, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8__NEON_MLAL_LANE, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mlal_lane);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8__NEON_MLAL_LANE, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8__NEON_MLAL_LANE, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mlal_lane);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8__NEON_MLAL_LANE, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8__NEON_MLAL_LANE, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mlal_lane);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8__NEON_MLAL_LANE, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8__NEON_MLAL_LANE, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8__NEON_MLAL_LANE, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mlal_lane);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8__NEON_MLAL_LANE, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8__NEON_MLAL_LANE, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8__NEON_MLAL_LANE, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mlal_lane);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8__NEON_MLAL_LANE, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8__NEON_MLAL_LANE, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mlal_lane);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8__NEON_MLAL_LANE, n_gt_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8__NEON_MLAL_LANE, n_div_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8__NEON_MLAL_LANE, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mlal_lane);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8__NEON_MLAL_LANE, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .a_offset(127)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8__NEON_MLAL_LANE, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 3; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .a_offset(127)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8__NEON_MLAL_LANE, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mlal_lane);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8__NEON_MLAL_LANE, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mlal_lane);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8__NEON_MLAL_LANE, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(8)
+      .cm_stride(11)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mlal_lane);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(QS8_IGEMM_MINMAX_1X16__NEON_MLAL_LANE, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
@@ -1895,6 +3767,23406 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_3X16__NEON_MLAL_LANE, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(8)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16__NEON_MLAL_LANE, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(8)
+      .cn_stride(19)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16__NEON_MLAL_LANE, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16__NEON_MLAL_LANE, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16__NEON_MLAL_LANE, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16__NEON_MLAL_LANE, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16__NEON_MLAL_LANE, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16__NEON_MLAL_LANE, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16__NEON_MLAL_LANE, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16__NEON_MLAL_LANE, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16__NEON_MLAL_LANE, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16__NEON_MLAL_LANE, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(3)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16__NEON_MLAL_LANE, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(3)
+          .n(16)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16__NEON_MLAL_LANE, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16__NEON_MLAL_LANE, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(3)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16__NEON_MLAL_LANE, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16__NEON_MLAL_LANE, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16__NEON_MLAL_LANE, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16__NEON_MLAL_LANE, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16__NEON_MLAL_LANE, n_gt_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(3)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16__NEON_MLAL_LANE, n_div_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(3)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16__NEON_MLAL_LANE, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16__NEON_MLAL_LANE, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .a_offset(127)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16__NEON_MLAL_LANE, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 3; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(3)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .a_offset(127)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16__NEON_MLAL_LANE, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16__NEON_MLAL_LANE, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16__NEON_MLAL_LANE, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(8)
+      .cm_stride(19)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_4X16__NEON_MLAL_LANE, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16__NEON_MLAL_LANE, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .cn_stride(19)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16__NEON_MLAL_LANE, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16__NEON_MLAL_LANE, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16__NEON_MLAL_LANE, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16__NEON_MLAL_LANE, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16__NEON_MLAL_LANE, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16__NEON_MLAL_LANE, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16__NEON_MLAL_LANE, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16__NEON_MLAL_LANE, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16__NEON_MLAL_LANE, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16__NEON_MLAL_LANE, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16__NEON_MLAL_LANE, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16__NEON_MLAL_LANE, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16__NEON_MLAL_LANE, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16__NEON_MLAL_LANE, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16__NEON_MLAL_LANE, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16__NEON_MLAL_LANE, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16__NEON_MLAL_LANE, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16__NEON_MLAL_LANE, n_gt_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16__NEON_MLAL_LANE, n_div_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16__NEON_MLAL_LANE, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16__NEON_MLAL_LANE, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .a_offset(163)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16__NEON_MLAL_LANE, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 4; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .a_offset(163)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16__NEON_MLAL_LANE, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16__NEON_MLAL_LANE, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16__NEON_MLAL_LANE, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .cm_stride(19)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_1X8C8__NEON_MULL_PADAL, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(8)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mull_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C8__NEON_MULL_PADAL, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(8)
+      .cn_stride(11)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mull_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C8__NEON_MULL_PADAL, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C8__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C8__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C8__NEON_MULL_PADAL, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C8__NEON_MULL_PADAL, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C8__NEON_MULL_PADAL, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C8__NEON_MULL_PADAL, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C8__NEON_MULL_PADAL, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C8__NEON_MULL_PADAL, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C8__NEON_MULL_PADAL, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C8__NEON_MULL_PADAL, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C8__NEON_MULL_PADAL, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C8__NEON_MULL_PADAL, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C8__NEON_MULL_PADAL, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C8__NEON_MULL_PADAL, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C8__NEON_MULL_PADAL, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C8__NEON_MULL_PADAL, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C8__NEON_MULL_PADAL, n_gt_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C8__NEON_MULL_PADAL, n_div_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C8__NEON_MULL_PADAL, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C8__NEON_MULL_PADAL, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .a_offset(43)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C8__NEON_MULL_PADAL, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 1; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .a_offset(43)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C8__NEON_MULL_PADAL, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mull_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C8__NEON_MULL_PADAL, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mull_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C8__NEON_MULL_PADAL, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(8)
+      .cm_stride(11)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mull_padal);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_2X8C8__NEON_MULL_PADAL, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(8)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mull_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__NEON_MULL_PADAL, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(8)
+      .cn_stride(11)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mull_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__NEON_MULL_PADAL, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__NEON_MULL_PADAL, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__NEON_MULL_PADAL, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__NEON_MULL_PADAL, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__NEON_MULL_PADAL, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__NEON_MULL_PADAL, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__NEON_MULL_PADAL, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__NEON_MULL_PADAL, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__NEON_MULL_PADAL, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__NEON_MULL_PADAL, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__NEON_MULL_PADAL, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__NEON_MULL_PADAL, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__NEON_MULL_PADAL, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__NEON_MULL_PADAL, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__NEON_MULL_PADAL, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__NEON_MULL_PADAL, n_gt_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__NEON_MULL_PADAL, n_div_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__NEON_MULL_PADAL, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__NEON_MULL_PADAL, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .a_offset(83)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__NEON_MULL_PADAL, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 2; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .a_offset(83)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__NEON_MULL_PADAL, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mull_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__NEON_MULL_PADAL, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mull_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__NEON_MULL_PADAL, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(8)
+      .cm_stride(11)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mull_padal);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_3X8C8__NEON_MULL_PADAL, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(8)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C8__NEON_MULL_PADAL, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(8)
+      .cn_stride(11)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C8__NEON_MULL_PADAL, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C8__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C8__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C8__NEON_MULL_PADAL, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C8__NEON_MULL_PADAL, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C8__NEON_MULL_PADAL, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C8__NEON_MULL_PADAL, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C8__NEON_MULL_PADAL, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C8__NEON_MULL_PADAL, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C8__NEON_MULL_PADAL, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C8__NEON_MULL_PADAL, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C8__NEON_MULL_PADAL, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C8__NEON_MULL_PADAL, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C8__NEON_MULL_PADAL, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C8__NEON_MULL_PADAL, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C8__NEON_MULL_PADAL, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C8__NEON_MULL_PADAL, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C8__NEON_MULL_PADAL, n_gt_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C8__NEON_MULL_PADAL, n_div_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C8__NEON_MULL_PADAL, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C8__NEON_MULL_PADAL, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .a_offset(127)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C8__NEON_MULL_PADAL, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 3; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .a_offset(127)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C8__NEON_MULL_PADAL, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C8__NEON_MULL_PADAL, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C8__NEON_MULL_PADAL, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(8)
+      .cm_stride(11)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_4X8C8__NEON_MULL_PADAL, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C8__NEON_MULL_PADAL, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .cn_stride(11)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C8__NEON_MULL_PADAL, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C8__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C8__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C8__NEON_MULL_PADAL, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C8__NEON_MULL_PADAL, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C8__NEON_MULL_PADAL, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C8__NEON_MULL_PADAL, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C8__NEON_MULL_PADAL, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C8__NEON_MULL_PADAL, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C8__NEON_MULL_PADAL, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C8__NEON_MULL_PADAL, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C8__NEON_MULL_PADAL, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C8__NEON_MULL_PADAL, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C8__NEON_MULL_PADAL, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C8__NEON_MULL_PADAL, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C8__NEON_MULL_PADAL, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C8__NEON_MULL_PADAL, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C8__NEON_MULL_PADAL, n_gt_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C8__NEON_MULL_PADAL, n_div_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C8__NEON_MULL_PADAL, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C8__NEON_MULL_PADAL, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .a_offset(163)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C8__NEON_MULL_PADAL, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 4; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .a_offset(163)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C8__NEON_MULL_PADAL, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C8__NEON_MULL_PADAL, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C8__NEON_MULL_PADAL, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .cm_stride(11)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_1X16C8__NEON_MULL_PADAL, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(8)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C8__NEON_MULL_PADAL, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(8)
+      .cn_stride(19)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C8__NEON_MULL_PADAL, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C8__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C8__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C8__NEON_MULL_PADAL, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C8__NEON_MULL_PADAL, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C8__NEON_MULL_PADAL, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C8__NEON_MULL_PADAL, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C8__NEON_MULL_PADAL, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C8__NEON_MULL_PADAL, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C8__NEON_MULL_PADAL, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C8__NEON_MULL_PADAL, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(16)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C8__NEON_MULL_PADAL, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C8__NEON_MULL_PADAL, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C8__NEON_MULL_PADAL, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C8__NEON_MULL_PADAL, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C8__NEON_MULL_PADAL, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C8__NEON_MULL_PADAL, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C8__NEON_MULL_PADAL, n_gt_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C8__NEON_MULL_PADAL, n_div_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C8__NEON_MULL_PADAL, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C8__NEON_MULL_PADAL, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .a_offset(43)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C8__NEON_MULL_PADAL, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 1; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .a_offset(43)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C8__NEON_MULL_PADAL, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C8__NEON_MULL_PADAL, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C8__NEON_MULL_PADAL, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(8)
+      .cm_stride(19)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_2X16C8__NEON_MULL_PADAL, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(8)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C8__NEON_MULL_PADAL, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(8)
+      .cn_stride(19)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C8__NEON_MULL_PADAL, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C8__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C8__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C8__NEON_MULL_PADAL, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C8__NEON_MULL_PADAL, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C8__NEON_MULL_PADAL, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C8__NEON_MULL_PADAL, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C8__NEON_MULL_PADAL, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C8__NEON_MULL_PADAL, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C8__NEON_MULL_PADAL, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C8__NEON_MULL_PADAL, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(16)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C8__NEON_MULL_PADAL, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C8__NEON_MULL_PADAL, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C8__NEON_MULL_PADAL, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C8__NEON_MULL_PADAL, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C8__NEON_MULL_PADAL, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C8__NEON_MULL_PADAL, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C8__NEON_MULL_PADAL, n_gt_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C8__NEON_MULL_PADAL, n_div_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C8__NEON_MULL_PADAL, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C8__NEON_MULL_PADAL, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .a_offset(83)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C8__NEON_MULL_PADAL, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 2; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .a_offset(83)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C8__NEON_MULL_PADAL, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C8__NEON_MULL_PADAL, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C8__NEON_MULL_PADAL, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(8)
+      .cm_stride(19)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_3X16C8__NEON_MULL_PADAL, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(8)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C8__NEON_MULL_PADAL, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(8)
+      .cn_stride(19)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C8__NEON_MULL_PADAL, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C8__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C8__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C8__NEON_MULL_PADAL, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C8__NEON_MULL_PADAL, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C8__NEON_MULL_PADAL, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C8__NEON_MULL_PADAL, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C8__NEON_MULL_PADAL, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C8__NEON_MULL_PADAL, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C8__NEON_MULL_PADAL, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C8__NEON_MULL_PADAL, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(16)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C8__NEON_MULL_PADAL, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C8__NEON_MULL_PADAL, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C8__NEON_MULL_PADAL, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C8__NEON_MULL_PADAL, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C8__NEON_MULL_PADAL, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C8__NEON_MULL_PADAL, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C8__NEON_MULL_PADAL, n_gt_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C8__NEON_MULL_PADAL, n_div_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C8__NEON_MULL_PADAL, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C8__NEON_MULL_PADAL, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .a_offset(127)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C8__NEON_MULL_PADAL, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 3; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .a_offset(127)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C8__NEON_MULL_PADAL, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C8__NEON_MULL_PADAL, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C8__NEON_MULL_PADAL, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(8)
+      .cm_stride(19)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_4X16C8__NEON_MULL_PADAL, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C8__NEON_MULL_PADAL, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .cn_stride(19)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C8__NEON_MULL_PADAL, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C8__NEON_MULL_PADAL, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C8__NEON_MULL_PADAL, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C8__NEON_MULL_PADAL, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C8__NEON_MULL_PADAL, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C8__NEON_MULL_PADAL, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C8__NEON_MULL_PADAL, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C8__NEON_MULL_PADAL, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C8__NEON_MULL_PADAL, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C8__NEON_MULL_PADAL, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C8__NEON_MULL_PADAL, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C8__NEON_MULL_PADAL, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C8__NEON_MULL_PADAL, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C8__NEON_MULL_PADAL, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C8__NEON_MULL_PADAL, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C8__NEON_MULL_PADAL, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C8__NEON_MULL_PADAL, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C8__NEON_MULL_PADAL, n_gt_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C8__NEON_MULL_PADAL, n_div_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C8__NEON_MULL_PADAL, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C8__NEON_MULL_PADAL, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .a_offset(163)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C8__NEON_MULL_PADAL, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 4; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .a_offset(163)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C8__NEON_MULL_PADAL, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C8__NEON_MULL_PADAL, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C8__NEON_MULL_PADAL, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .cm_stride(19)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_1X8C8__NEON_MLAL_PADAL, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C8__NEON_MLAL_PADAL, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(16)
+      .cn_stride(11)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C8__NEON_MLAL_PADAL, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C8__NEON_MLAL_PADAL, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C8__NEON_MLAL_PADAL, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C8__NEON_MLAL_PADAL, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C8__NEON_MLAL_PADAL, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C8__NEON_MLAL_PADAL, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C8__NEON_MLAL_PADAL, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C8__NEON_MLAL_PADAL, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C8__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C8__NEON_MLAL_PADAL, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C8__NEON_MLAL_PADAL, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C8__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C8__NEON_MLAL_PADAL, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C8__NEON_MLAL_PADAL, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C8__NEON_MLAL_PADAL, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C8__NEON_MLAL_PADAL, n_gt_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C8__NEON_MLAL_PADAL, n_div_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C8__NEON_MLAL_PADAL, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C8__NEON_MLAL_PADAL, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .a_offset(83)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C8__NEON_MLAL_PADAL, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 1; mz++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .a_offset(83)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C8__NEON_MLAL_PADAL, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C8__NEON_MLAL_PADAL, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C8__NEON_MLAL_PADAL, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(16)
+      .cm_stride(11)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mlal_padal);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_2X8C8__NEON_MLAL_PADAL, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__NEON_MLAL_PADAL, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .cn_stride(11)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__NEON_MLAL_PADAL, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__NEON_MLAL_PADAL, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__NEON_MLAL_PADAL, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__NEON_MLAL_PADAL, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__NEON_MLAL_PADAL, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__NEON_MLAL_PADAL, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__NEON_MLAL_PADAL, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__NEON_MLAL_PADAL, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__NEON_MLAL_PADAL, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__NEON_MLAL_PADAL, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__NEON_MLAL_PADAL, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__NEON_MLAL_PADAL, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__NEON_MLAL_PADAL, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__NEON_MLAL_PADAL, n_gt_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__NEON_MLAL_PADAL, n_div_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__NEON_MLAL_PADAL, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__NEON_MLAL_PADAL, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .a_offset(163)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__NEON_MLAL_PADAL, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 2; mz++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .a_offset(163)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__NEON_MLAL_PADAL, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__NEON_MLAL_PADAL, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C8__NEON_MLAL_PADAL, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .cm_stride(11)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_3X8C8__NEON_MLAL_PADAL, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C8__NEON_MLAL_PADAL, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(16)
+      .cn_stride(11)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C8__NEON_MLAL_PADAL, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C8__NEON_MLAL_PADAL, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C8__NEON_MLAL_PADAL, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C8__NEON_MLAL_PADAL, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C8__NEON_MLAL_PADAL, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C8__NEON_MLAL_PADAL, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C8__NEON_MLAL_PADAL, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C8__NEON_MLAL_PADAL, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C8__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C8__NEON_MLAL_PADAL, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C8__NEON_MLAL_PADAL, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C8__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C8__NEON_MLAL_PADAL, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C8__NEON_MLAL_PADAL, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C8__NEON_MLAL_PADAL, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C8__NEON_MLAL_PADAL, n_gt_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C8__NEON_MLAL_PADAL, n_div_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C8__NEON_MLAL_PADAL, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C8__NEON_MLAL_PADAL, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .a_offset(251)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C8__NEON_MLAL_PADAL, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 3; mz++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .a_offset(251)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C8__NEON_MLAL_PADAL, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C8__NEON_MLAL_PADAL, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C8__NEON_MLAL_PADAL, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(16)
+      .cm_stride(11)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_4X8C8__NEON_MLAL_PADAL, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C8__NEON_MLAL_PADAL, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(16)
+      .cn_stride(11)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C8__NEON_MLAL_PADAL, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C8__NEON_MLAL_PADAL, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C8__NEON_MLAL_PADAL, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C8__NEON_MLAL_PADAL, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C8__NEON_MLAL_PADAL, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C8__NEON_MLAL_PADAL, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C8__NEON_MLAL_PADAL, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C8__NEON_MLAL_PADAL, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C8__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C8__NEON_MLAL_PADAL, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C8__NEON_MLAL_PADAL, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C8__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C8__NEON_MLAL_PADAL, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C8__NEON_MLAL_PADAL, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C8__NEON_MLAL_PADAL, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C8__NEON_MLAL_PADAL, n_gt_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C8__NEON_MLAL_PADAL, n_div_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C8__NEON_MLAL_PADAL, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C8__NEON_MLAL_PADAL, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .a_offset(331)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C8__NEON_MLAL_PADAL, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 4; mz++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .a_offset(331)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C8__NEON_MLAL_PADAL, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C8__NEON_MLAL_PADAL, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C8__NEON_MLAL_PADAL, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(16)
+      .cm_stride(11)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_1X16C8__NEON_MLAL_PADAL, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C8__NEON_MLAL_PADAL, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(16)
+      .cn_stride(19)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C8__NEON_MLAL_PADAL, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C8__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C8__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C8__NEON_MLAL_PADAL, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C8__NEON_MLAL_PADAL, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C8__NEON_MLAL_PADAL, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C8__NEON_MLAL_PADAL, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C8__NEON_MLAL_PADAL, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C8__NEON_MLAL_PADAL, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C8__NEON_MLAL_PADAL, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C8__NEON_MLAL_PADAL, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(16)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C8__NEON_MLAL_PADAL, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C8__NEON_MLAL_PADAL, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C8__NEON_MLAL_PADAL, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C8__NEON_MLAL_PADAL, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C8__NEON_MLAL_PADAL, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C8__NEON_MLAL_PADAL, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C8__NEON_MLAL_PADAL, n_gt_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C8__NEON_MLAL_PADAL, n_div_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C8__NEON_MLAL_PADAL, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C8__NEON_MLAL_PADAL, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .a_offset(83)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C8__NEON_MLAL_PADAL, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 1; mz++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .a_offset(83)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C8__NEON_MLAL_PADAL, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C8__NEON_MLAL_PADAL, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C8__NEON_MLAL_PADAL, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(16)
+      .cm_stride(19)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_2X16C8__NEON_MLAL_PADAL, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C8__NEON_MLAL_PADAL, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(16)
+      .cn_stride(19)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C8__NEON_MLAL_PADAL, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C8__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C8__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C8__NEON_MLAL_PADAL, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C8__NEON_MLAL_PADAL, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C8__NEON_MLAL_PADAL, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C8__NEON_MLAL_PADAL, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C8__NEON_MLAL_PADAL, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C8__NEON_MLAL_PADAL, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C8__NEON_MLAL_PADAL, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C8__NEON_MLAL_PADAL, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(16)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C8__NEON_MLAL_PADAL, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C8__NEON_MLAL_PADAL, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C8__NEON_MLAL_PADAL, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C8__NEON_MLAL_PADAL, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C8__NEON_MLAL_PADAL, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C8__NEON_MLAL_PADAL, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C8__NEON_MLAL_PADAL, n_gt_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C8__NEON_MLAL_PADAL, n_div_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C8__NEON_MLAL_PADAL, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C8__NEON_MLAL_PADAL, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .a_offset(163)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C8__NEON_MLAL_PADAL, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 2; mz++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .a_offset(163)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C8__NEON_MLAL_PADAL, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C8__NEON_MLAL_PADAL, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C8__NEON_MLAL_PADAL, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(16)
+      .cm_stride(19)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_3X16C8__NEON_MLAL_PADAL, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C8__NEON_MLAL_PADAL, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(16)
+      .cn_stride(19)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C8__NEON_MLAL_PADAL, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C8__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C8__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C8__NEON_MLAL_PADAL, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C8__NEON_MLAL_PADAL, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C8__NEON_MLAL_PADAL, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C8__NEON_MLAL_PADAL, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C8__NEON_MLAL_PADAL, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C8__NEON_MLAL_PADAL, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C8__NEON_MLAL_PADAL, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C8__NEON_MLAL_PADAL, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(16)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C8__NEON_MLAL_PADAL, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C8__NEON_MLAL_PADAL, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C8__NEON_MLAL_PADAL, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C8__NEON_MLAL_PADAL, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C8__NEON_MLAL_PADAL, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C8__NEON_MLAL_PADAL, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C8__NEON_MLAL_PADAL, n_gt_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C8__NEON_MLAL_PADAL, n_div_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C8__NEON_MLAL_PADAL, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C8__NEON_MLAL_PADAL, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .a_offset(251)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C8__NEON_MLAL_PADAL, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 3; mz++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .a_offset(251)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C8__NEON_MLAL_PADAL, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C8__NEON_MLAL_PADAL, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C8__NEON_MLAL_PADAL, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(16)
+      .cm_stride(19)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_4X16C8__NEON_MLAL_PADAL, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C8__NEON_MLAL_PADAL, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .cn_stride(19)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C8__NEON_MLAL_PADAL, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C8__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C8__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C8__NEON_MLAL_PADAL, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C8__NEON_MLAL_PADAL, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C8__NEON_MLAL_PADAL, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C8__NEON_MLAL_PADAL, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C8__NEON_MLAL_PADAL, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C8__NEON_MLAL_PADAL, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C8__NEON_MLAL_PADAL, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C8__NEON_MLAL_PADAL, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C8__NEON_MLAL_PADAL, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C8__NEON_MLAL_PADAL, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C8__NEON_MLAL_PADAL, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C8__NEON_MLAL_PADAL, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C8__NEON_MLAL_PADAL, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C8__NEON_MLAL_PADAL, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C8__NEON_MLAL_PADAL, n_gt_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C8__NEON_MLAL_PADAL, n_div_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C8__NEON_MLAL_PADAL, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C8__NEON_MLAL_PADAL, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .a_offset(331)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C8__NEON_MLAL_PADAL, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 4; mz++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .a_offset(331)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C8__NEON_MLAL_PADAL, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C8__NEON_MLAL_PADAL, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C8__NEON_MLAL_PADAL, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .cm_stride(19)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_1X8C16__NEON_MLAL_PADAL, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x8c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C16__NEON_MLAL_PADAL, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(16)
+      .cn_stride(11)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x8c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C16__NEON_MLAL_PADAL, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C16__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C16__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C16__NEON_MLAL_PADAL, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C16__NEON_MLAL_PADAL, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x8c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C16__NEON_MLAL_PADAL, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C16__NEON_MLAL_PADAL, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x8c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C16__NEON_MLAL_PADAL, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C16__NEON_MLAL_PADAL, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x8c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C16__NEON_MLAL_PADAL, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C16__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C16__NEON_MLAL_PADAL, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x8c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C16__NEON_MLAL_PADAL, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C16__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C16__NEON_MLAL_PADAL, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x8c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C16__NEON_MLAL_PADAL, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C16__NEON_MLAL_PADAL, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x8c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C16__NEON_MLAL_PADAL, n_gt_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C16__NEON_MLAL_PADAL, n_div_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C16__NEON_MLAL_PADAL, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x8c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C16__NEON_MLAL_PADAL, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .a_offset(83)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C16__NEON_MLAL_PADAL, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 1; mz++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .a_offset(83)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C16__NEON_MLAL_PADAL, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x8c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C16__NEON_MLAL_PADAL, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x8c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C16__NEON_MLAL_PADAL, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(16)
+      .cm_stride(11)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x8c16__neon_mlal_padal);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_2X8C16__NEON_MLAL_PADAL, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x8c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C16__NEON_MLAL_PADAL, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .cn_stride(11)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x8c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C16__NEON_MLAL_PADAL, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C16__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C16__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C16__NEON_MLAL_PADAL, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C16__NEON_MLAL_PADAL, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C16__NEON_MLAL_PADAL, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C16__NEON_MLAL_PADAL, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C16__NEON_MLAL_PADAL, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C16__NEON_MLAL_PADAL, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C16__NEON_MLAL_PADAL, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C16__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C16__NEON_MLAL_PADAL, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C16__NEON_MLAL_PADAL, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C16__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C16__NEON_MLAL_PADAL, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C16__NEON_MLAL_PADAL, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C16__NEON_MLAL_PADAL, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C16__NEON_MLAL_PADAL, n_gt_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C16__NEON_MLAL_PADAL, n_div_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C16__NEON_MLAL_PADAL, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C16__NEON_MLAL_PADAL, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .a_offset(163)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C16__NEON_MLAL_PADAL, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 2; mz++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .a_offset(163)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C16__NEON_MLAL_PADAL, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x8c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C16__NEON_MLAL_PADAL, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x8c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C16__NEON_MLAL_PADAL, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .cm_stride(11)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x8c16__neon_mlal_padal);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_3X8C16__NEON_MLAL_PADAL, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C16__NEON_MLAL_PADAL, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(16)
+      .cn_stride(11)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C16__NEON_MLAL_PADAL, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C16__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C16__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C16__NEON_MLAL_PADAL, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C16__NEON_MLAL_PADAL, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C16__NEON_MLAL_PADAL, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C16__NEON_MLAL_PADAL, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C16__NEON_MLAL_PADAL, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C16__NEON_MLAL_PADAL, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C16__NEON_MLAL_PADAL, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C16__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C16__NEON_MLAL_PADAL, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C16__NEON_MLAL_PADAL, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C16__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C16__NEON_MLAL_PADAL, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C16__NEON_MLAL_PADAL, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C16__NEON_MLAL_PADAL, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C16__NEON_MLAL_PADAL, n_gt_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C16__NEON_MLAL_PADAL, n_div_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C16__NEON_MLAL_PADAL, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C16__NEON_MLAL_PADAL, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .a_offset(251)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C16__NEON_MLAL_PADAL, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 3; mz++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .a_offset(251)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C16__NEON_MLAL_PADAL, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C16__NEON_MLAL_PADAL, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C16__NEON_MLAL_PADAL, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(16)
+      .cm_stride(11)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
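(Editorial note, not part of the upstream patch.) Every TEST block in this generated file follows one template: fix the micro-kernel's tile parameters with mr/nr/kr/sr, sweep the problem sizes m, n and k around the tile boundaries (k equal to, below, above, and at multiples of kr; n beyond and at multiples of nr; all m-by-n subtiles), and let GemmMicrokernelTester check the kernel's output against a reference computation. The self-contained sketch below illustrates only that sweep structure; TileSweep and ForEachCase are hypothetical names, not the GemmMicrokernelTester API.

    // Illustrative sketch only -- TileSweep is a hypothetical stand-in for the
    // parameter sweep performed by the generated GemmMicrokernelTester cases.
    #include <cstddef>
    #include <cstdint>
    #include <iostream>

    struct TileSweep {
      uint32_t mr;  // rows of the output tile produced per kernel call
      uint32_t nr;  // columns of the output tile
      uint32_t kr;  // channel (k) unroll factor of the kernel

      // Probe k below, at, above, and at multiples of kr, and every m-by-n subtile,
      // mirroring the k_lt/k_eq/k_gt/k_div and *_subtile cases above.
      template <typename Fn>
      void ForEachCase(Fn&& run_case) const {
        for (size_t k = 1; k <= size_t{10} * kr; k += (k < size_t{2} * kr) ? 1 : kr) {
          for (uint32_t m = 1; m <= mr; m++) {
            for (uint32_t n = 1; n <= nr; n++) {
              run_case(m, n, k);
            }
          }
        }
      }
    };

    int main() {
      size_t cases = 0;
      const TileSweep sweep{3, 8, 16};  // e.g. the 3x8c16 tile tested above
      sweep.ForEachCase([&](uint32_t, uint32_t, size_t) { cases++; });
      std::cout << "swept " << cases << " (m, n, k) combinations\n";
      return 0;
    }
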
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_4X8C16__NEON_MLAL_PADAL, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C16__NEON_MLAL_PADAL, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(16)
+      .cn_stride(11)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C16__NEON_MLAL_PADAL, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C16__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C16__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C16__NEON_MLAL_PADAL, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C16__NEON_MLAL_PADAL, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C16__NEON_MLAL_PADAL, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C16__NEON_MLAL_PADAL, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C16__NEON_MLAL_PADAL, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C16__NEON_MLAL_PADAL, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C16__NEON_MLAL_PADAL, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C16__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C16__NEON_MLAL_PADAL, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C16__NEON_MLAL_PADAL, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C16__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C16__NEON_MLAL_PADAL, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C16__NEON_MLAL_PADAL, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C16__NEON_MLAL_PADAL, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C16__NEON_MLAL_PADAL, n_gt_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C16__NEON_MLAL_PADAL, n_div_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C16__NEON_MLAL_PADAL, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C16__NEON_MLAL_PADAL, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .a_offset(331)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C16__NEON_MLAL_PADAL, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 4; mz++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .a_offset(331)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C16__NEON_MLAL_PADAL, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C16__NEON_MLAL_PADAL, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C16__NEON_MLAL_PADAL, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(16)
+      .cm_stride(11)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_1X16C16__NEON_MLAL_PADAL, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(16)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C16__NEON_MLAL_PADAL, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(16)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(16)
+      .cn_stride(19)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C16__NEON_MLAL_PADAL, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C16__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C16__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C16__NEON_MLAL_PADAL, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C16__NEON_MLAL_PADAL, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C16__NEON_MLAL_PADAL, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C16__NEON_MLAL_PADAL, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C16__NEON_MLAL_PADAL, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C16__NEON_MLAL_PADAL, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C16__NEON_MLAL_PADAL, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(1)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C16__NEON_MLAL_PADAL, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(1)
+          .n(16)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C16__NEON_MLAL_PADAL, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C16__NEON_MLAL_PADAL, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(1)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C16__NEON_MLAL_PADAL, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C16__NEON_MLAL_PADAL, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C16__NEON_MLAL_PADAL, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C16__NEON_MLAL_PADAL, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C16__NEON_MLAL_PADAL, n_gt_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(1)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C16__NEON_MLAL_PADAL, n_div_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(1)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C16__NEON_MLAL_PADAL, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C16__NEON_MLAL_PADAL, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .a_offset(83)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C16__NEON_MLAL_PADAL, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 1; mz++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(1)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .a_offset(83)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C16__NEON_MLAL_PADAL, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(16)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C16__NEON_MLAL_PADAL, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(16)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C16__NEON_MLAL_PADAL, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(16)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(16)
+      .cm_stride(19)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_2X16C16__NEON_MLAL_PADAL, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C16__NEON_MLAL_PADAL, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(16)
+      .cn_stride(19)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C16__NEON_MLAL_PADAL, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C16__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C16__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C16__NEON_MLAL_PADAL, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C16__NEON_MLAL_PADAL, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C16__NEON_MLAL_PADAL, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C16__NEON_MLAL_PADAL, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C16__NEON_MLAL_PADAL, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C16__NEON_MLAL_PADAL, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C16__NEON_MLAL_PADAL, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C16__NEON_MLAL_PADAL, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(16)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C16__NEON_MLAL_PADAL, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C16__NEON_MLAL_PADAL, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C16__NEON_MLAL_PADAL, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C16__NEON_MLAL_PADAL, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C16__NEON_MLAL_PADAL, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C16__NEON_MLAL_PADAL, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C16__NEON_MLAL_PADAL, n_gt_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C16__NEON_MLAL_PADAL, n_div_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C16__NEON_MLAL_PADAL, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C16__NEON_MLAL_PADAL, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .a_offset(163)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C16__NEON_MLAL_PADAL, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 2; mz++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .a_offset(163)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C16__NEON_MLAL_PADAL, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C16__NEON_MLAL_PADAL, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C16__NEON_MLAL_PADAL, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(16)
+      .cm_stride(19)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_3X16C16__NEON_MLAL_PADAL, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(16)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C16__NEON_MLAL_PADAL, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(16)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(16)
+      .cn_stride(19)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C16__NEON_MLAL_PADAL, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C16__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C16__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C16__NEON_MLAL_PADAL, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C16__NEON_MLAL_PADAL, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C16__NEON_MLAL_PADAL, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C16__NEON_MLAL_PADAL, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C16__NEON_MLAL_PADAL, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C16__NEON_MLAL_PADAL, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C16__NEON_MLAL_PADAL, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(3)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C16__NEON_MLAL_PADAL, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(3)
+          .n(16)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C16__NEON_MLAL_PADAL, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C16__NEON_MLAL_PADAL, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(3)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C16__NEON_MLAL_PADAL, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C16__NEON_MLAL_PADAL, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C16__NEON_MLAL_PADAL, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C16__NEON_MLAL_PADAL, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C16__NEON_MLAL_PADAL, n_gt_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(3)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C16__NEON_MLAL_PADAL, n_div_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(3)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C16__NEON_MLAL_PADAL, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C16__NEON_MLAL_PADAL, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .a_offset(251)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C16__NEON_MLAL_PADAL, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 3; mz++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(3)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .a_offset(251)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C16__NEON_MLAL_PADAL, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(16)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C16__NEON_MLAL_PADAL, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(16)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C16__NEON_MLAL_PADAL, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(16)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(16)
+      .cm_stride(19)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
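(Editorial note, not part of the upstream patch.) Beyond the plain GEMM sweeps, the small_kernel, a_offset and zero cases above exercise the IGEMM-specific inputs: ks is the number of indirection pointers per output row (kernel taps), a_offset is added by the kernel to every input pointer except the shared zero pointer, and zero_index marks one pointer position as that zero pointer so the zero-skipping path is covered. The sketch below shows the kind of indirection table those cases imply; BuildIndirection and its arguments are a hypothetical illustration, not XNNPACK's packing code.

    // Illustrative sketch only -- not XNNPACK's indirection/packing code.
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Builds one input pointer per (output row, kernel tap). Pointers equal to
    // `zero` are left untouched by the kernel; every other pointer has `a_offset`
    // added before the row is read, so real rows are stored pre-offset here.
    std::vector<const int8_t*> BuildIndirection(const int8_t* input, size_t row_stride,
                                                const int8_t* zero, size_t a_offset,
                                                size_t mr, size_t ks, size_t zero_row) {
      std::vector<const int8_t*> table(mr * ks);
      for (size_t tap = 0; tap < ks; tap++) {
        for (size_t row = 0; row < mr; row++) {
          table[tap * mr + row] = (row == zero_row)
              ? zero
              : input + (tap * mr + row) * row_stride - a_offset;
        }
      }
      return table;
    }
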
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_4X16C16__NEON_MLAL_PADAL, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(16)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C16__NEON_MLAL_PADAL, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(16)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .cn_stride(19)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C16__NEON_MLAL_PADAL, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C16__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C16__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C16__NEON_MLAL_PADAL, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C16__NEON_MLAL_PADAL, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C16__NEON_MLAL_PADAL, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C16__NEON_MLAL_PADAL, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C16__NEON_MLAL_PADAL, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C16__NEON_MLAL_PADAL, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C16__NEON_MLAL_PADAL, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C16__NEON_MLAL_PADAL, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C16__NEON_MLAL_PADAL, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C16__NEON_MLAL_PADAL, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C16__NEON_MLAL_PADAL, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C16__NEON_MLAL_PADAL, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C16__NEON_MLAL_PADAL, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C16__NEON_MLAL_PADAL, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C16__NEON_MLAL_PADAL, n_gt_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C16__NEON_MLAL_PADAL, n_div_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C16__NEON_MLAL_PADAL, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C16__NEON_MLAL_PADAL, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(16)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .a_offset(331)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C16__NEON_MLAL_PADAL, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 4; mz++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(16)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .a_offset(331)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C16__NEON_MLAL_PADAL, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(16)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C16__NEON_MLAL_PADAL, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(16)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C16__NEON_MLAL_PADAL, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(16)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .cm_stride(19)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(8)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(8)
+      .cn_stride(11)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, n_gt_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, n_div_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .a_offset(43)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 1; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .a_offset(43)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(8)
+      .cm_stride(11)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(8)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(8)
+      .cn_stride(11)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, n_gt_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, n_div_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .a_offset(83)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 2; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .a_offset(83)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(8)
+      .cm_stride(11)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(8)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(8)
+      .cn_stride(11)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, n_gt_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, n_div_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .a_offset(127)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 3; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .a_offset(127)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(8)
+      .cm_stride(11)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .cn_stride(11)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, n_gt_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, n_div_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .a_offset(163)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 4; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .a_offset(163)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .cm_stride(11)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(8)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(8)
+      .cn_stride(19)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(16)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, n_gt_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, n_div_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .a_offset(43)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 1; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .a_offset(43)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(8)
+      .cm_stride(19)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(8)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(8)
+      .cn_stride(19)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(16)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, n_gt_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, n_div_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .a_offset(83)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 2; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .a_offset(83)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(8)
+      .cm_stride(19)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(8)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(8)
+      .cn_stride(19)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(16)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, n_gt_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, n_div_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .a_offset(127)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 3; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .a_offset(127)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(8)
+      .cm_stride(19)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .cn_stride(19)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, n_gt_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, n_div_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .a_offset(163)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 4; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .a_offset(163)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .cm_stride(19)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(16)
+      .cn_stride(11)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, n_div_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .a_offset(83)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 1; mz++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .a_offset(83)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(16)
+      .cm_stride(11)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .cn_stride(11)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, n_div_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .a_offset(163)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 2; mz++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .a_offset(163)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .cm_stride(11)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(16)
+      .cn_stride(11)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, n_div_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .a_offset(251)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 3; mz++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .a_offset(251)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(16)
+      .cm_stride(11)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(16)
+      .cn_stride(11)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, n_div_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .a_offset(331)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 4; mz++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .a_offset(331)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(16)
+      .cm_stride(11)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(16)
+      .cn_stride(19)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(16)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, n_div_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .a_offset(83)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 1; mz++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .a_offset(83)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(16)
+      .cm_stride(19)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(16)
+      .cn_stride(19)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(16)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, n_div_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .a_offset(163)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 2; mz++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .a_offset(163)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(16)
+      .cm_stride(19)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(16)
+      .cn_stride(19)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(16)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, n_div_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .a_offset(251)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 3; mz++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .a_offset(251)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(16)
+      .cm_stride(19)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .cn_stride(19)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, n_div_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .a_offset(331)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 4; mz++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .a_offset(331)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .cm_stride(19)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_1X8__NEON_MULL_ADDW_DUP, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(8)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x8__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8__NEON_MULL_ADDW_DUP, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(8)
+      .cn_stride(11)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x8__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8__NEON_MULL_ADDW_DUP, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8__NEON_MULL_ADDW_DUP, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8__NEON_MULL_ADDW_DUP, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8__NEON_MULL_ADDW_DUP, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8__NEON_MULL_ADDW_DUP, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x8__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8__NEON_MULL_ADDW_DUP, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8__NEON_MULL_ADDW_DUP, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x8__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8__NEON_MULL_ADDW_DUP, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8__NEON_MULL_ADDW_DUP, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x8__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8__NEON_MULL_ADDW_DUP, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8__NEON_MULL_ADDW_DUP, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8__NEON_MULL_ADDW_DUP, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x8__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8__NEON_MULL_ADDW_DUP, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8__NEON_MULL_ADDW_DUP, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8__NEON_MULL_ADDW_DUP, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x8__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8__NEON_MULL_ADDW_DUP, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8__NEON_MULL_ADDW_DUP, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x8__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8__NEON_MULL_ADDW_DUP, n_gt_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8__NEON_MULL_ADDW_DUP, n_div_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8__NEON_MULL_ADDW_DUP, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x8__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8__NEON_MULL_ADDW_DUP, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .a_offset(43)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8__NEON_MULL_ADDW_DUP, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 1; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .a_offset(43)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8__NEON_MULL_ADDW_DUP, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x8__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8__NEON_MULL_ADDW_DUP, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x8__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X8__NEON_MULL_ADDW_DUP, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(8)
+      .cm_stride(11)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x8__neon_mull_addw_dup);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_2X8__NEON_MULL_ADDW_DUP, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(8)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x8__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8__NEON_MULL_ADDW_DUP, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(8)
+      .cn_stride(11)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x8__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8__NEON_MULL_ADDW_DUP, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8__NEON_MULL_ADDW_DUP, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8__NEON_MULL_ADDW_DUP, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8__NEON_MULL_ADDW_DUP, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8__NEON_MULL_ADDW_DUP, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8__NEON_MULL_ADDW_DUP, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8__NEON_MULL_ADDW_DUP, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8__NEON_MULL_ADDW_DUP, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8__NEON_MULL_ADDW_DUP, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8__NEON_MULL_ADDW_DUP, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8__NEON_MULL_ADDW_DUP, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8__NEON_MULL_ADDW_DUP, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8__NEON_MULL_ADDW_DUP, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8__NEON_MULL_ADDW_DUP, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8__NEON_MULL_ADDW_DUP, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8__NEON_MULL_ADDW_DUP, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8__NEON_MULL_ADDW_DUP, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8__NEON_MULL_ADDW_DUP, n_gt_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8__NEON_MULL_ADDW_DUP, n_div_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8__NEON_MULL_ADDW_DUP, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8__NEON_MULL_ADDW_DUP, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .a_offset(83)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8__NEON_MULL_ADDW_DUP, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 2; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .a_offset(83)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8__NEON_MULL_ADDW_DUP, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x8__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8__NEON_MULL_ADDW_DUP, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x8__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X8__NEON_MULL_ADDW_DUP, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(8)
+      .cm_stride(11)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x8__neon_mull_addw_dup);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_3X8__NEON_MULL_ADDW_DUP, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(8)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8__NEON_MULL_ADDW_DUP, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(8)
+      .cn_stride(11)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8__NEON_MULL_ADDW_DUP, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8__NEON_MULL_ADDW_DUP, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8__NEON_MULL_ADDW_DUP, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8__NEON_MULL_ADDW_DUP, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8__NEON_MULL_ADDW_DUP, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8__NEON_MULL_ADDW_DUP, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8__NEON_MULL_ADDW_DUP, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8__NEON_MULL_ADDW_DUP, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8__NEON_MULL_ADDW_DUP, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8__NEON_MULL_ADDW_DUP, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8__NEON_MULL_ADDW_DUP, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8__NEON_MULL_ADDW_DUP, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8__NEON_MULL_ADDW_DUP, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8__NEON_MULL_ADDW_DUP, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8__NEON_MULL_ADDW_DUP, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8__NEON_MULL_ADDW_DUP, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8__NEON_MULL_ADDW_DUP, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8__NEON_MULL_ADDW_DUP, n_gt_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8__NEON_MULL_ADDW_DUP, n_div_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8__NEON_MULL_ADDW_DUP, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8__NEON_MULL_ADDW_DUP, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .a_offset(127)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8__NEON_MULL_ADDW_DUP, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 3; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(3)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .a_offset(127)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8__NEON_MULL_ADDW_DUP, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8__NEON_MULL_ADDW_DUP, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X8__NEON_MULL_ADDW_DUP, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(8)
+      .cm_stride(11)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mull_addw_dup);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
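+  // Tests below exercise the QS8 IGEMM 4x8 neon_mull_addw_dup microkernel:
+  // full tiles and m/n subtiles, k below/above/multiples of the 8-element
+  // block, n past the 8-column tile, strided cn/cm outputs, small kernels
+  // (ks=3), a_offset with zero_index, and qmin/qmax clamping.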
+  TEST(QS8_IGEMM_MINMAX_4X8__NEON_MULL_ADDW_DUP, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8__NEON_MULL_ADDW_DUP, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .cn_stride(11)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8__NEON_MULL_ADDW_DUP, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8__NEON_MULL_ADDW_DUP, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8__NEON_MULL_ADDW_DUP, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8__NEON_MULL_ADDW_DUP, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8__NEON_MULL_ADDW_DUP, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8__NEON_MULL_ADDW_DUP, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8__NEON_MULL_ADDW_DUP, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8__NEON_MULL_ADDW_DUP, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8__NEON_MULL_ADDW_DUP, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8__NEON_MULL_ADDW_DUP, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8__NEON_MULL_ADDW_DUP, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8__NEON_MULL_ADDW_DUP, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8__NEON_MULL_ADDW_DUP, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8__NEON_MULL_ADDW_DUP, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8__NEON_MULL_ADDW_DUP, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8__NEON_MULL_ADDW_DUP, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8__NEON_MULL_ADDW_DUP, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8__NEON_MULL_ADDW_DUP, n_gt_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8__NEON_MULL_ADDW_DUP, n_div_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8__NEON_MULL_ADDW_DUP, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8__NEON_MULL_ADDW_DUP, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .a_offset(163)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8__NEON_MULL_ADDW_DUP, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 4; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .a_offset(163)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8__NEON_MULL_ADDW_DUP, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8__NEON_MULL_ADDW_DUP, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X8__NEON_MULL_ADDW_DUP, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .cm_stride(11)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mull_addw_dup);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
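+  // Tests below exercise the QS8 IGEMM 1x16 neon_mull_addw_dup microkernel
+  // over the same shape sweep, with nr = 16 (n_gt_16/n_div_16 cases and a
+  // 19-element output stride).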
+  TEST(QS8_IGEMM_MINMAX_1X16__NEON_MULL_ADDW_DUP, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(8)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16__NEON_MULL_ADDW_DUP, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(8)
+      .cn_stride(19)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16__NEON_MULL_ADDW_DUP, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16__NEON_MULL_ADDW_DUP, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16__NEON_MULL_ADDW_DUP, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16__NEON_MULL_ADDW_DUP, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16__NEON_MULL_ADDW_DUP, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16__NEON_MULL_ADDW_DUP, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16__NEON_MULL_ADDW_DUP, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16__NEON_MULL_ADDW_DUP, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16__NEON_MULL_ADDW_DUP, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16__NEON_MULL_ADDW_DUP, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16__NEON_MULL_ADDW_DUP, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16__NEON_MULL_ADDW_DUP, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16__NEON_MULL_ADDW_DUP, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16__NEON_MULL_ADDW_DUP, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16__NEON_MULL_ADDW_DUP, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16__NEON_MULL_ADDW_DUP, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16__NEON_MULL_ADDW_DUP, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16__NEON_MULL_ADDW_DUP, n_gt_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16__NEON_MULL_ADDW_DUP, n_div_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16__NEON_MULL_ADDW_DUP, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16__NEON_MULL_ADDW_DUP, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .a_offset(43)
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16__NEON_MULL_ADDW_DUP, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 1; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(1)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .a_offset(43)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16__NEON_MULL_ADDW_DUP, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16__NEON_MULL_ADDW_DUP, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_1X16__NEON_MULL_ADDW_DUP, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(8)
+      .cm_stride(19)
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
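+  // Same sweep for the 2x16 variant of the neon_mull_addw_dup QS8 IGEMM
+  // microkernel (mr = 2, nr = 16).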
+  TEST(QS8_IGEMM_MINMAX_2X16__NEON_MULL_ADDW_DUP, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(8)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16__NEON_MULL_ADDW_DUP, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(8)
+      .cn_stride(19)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16__NEON_MULL_ADDW_DUP, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16__NEON_MULL_ADDW_DUP, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16__NEON_MULL_ADDW_DUP, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16__NEON_MULL_ADDW_DUP, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16__NEON_MULL_ADDW_DUP, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16__NEON_MULL_ADDW_DUP, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16__NEON_MULL_ADDW_DUP, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16__NEON_MULL_ADDW_DUP, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16__NEON_MULL_ADDW_DUP, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16__NEON_MULL_ADDW_DUP, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16__NEON_MULL_ADDW_DUP, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16__NEON_MULL_ADDW_DUP, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16__NEON_MULL_ADDW_DUP, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16__NEON_MULL_ADDW_DUP, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16__NEON_MULL_ADDW_DUP, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16__NEON_MULL_ADDW_DUP, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16__NEON_MULL_ADDW_DUP, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16__NEON_MULL_ADDW_DUP, n_gt_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16__NEON_MULL_ADDW_DUP, n_div_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16__NEON_MULL_ADDW_DUP, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16__NEON_MULL_ADDW_DUP, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .a_offset(83)
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16__NEON_MULL_ADDW_DUP, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 2; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(2)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .a_offset(83)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16__NEON_MULL_ADDW_DUP, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16__NEON_MULL_ADDW_DUP, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_2X16__NEON_MULL_ADDW_DUP, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(8)
+      .cm_stride(19)
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
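+  // Same sweep for the 3x16 variant (mr = 3, nr = 16).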
+  TEST(QS8_IGEMM_MINMAX_3X16__NEON_MULL_ADDW_DUP, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(8)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16__NEON_MULL_ADDW_DUP, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(8)
+      .cn_stride(19)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16__NEON_MULL_ADDW_DUP, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16__NEON_MULL_ADDW_DUP, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16__NEON_MULL_ADDW_DUP, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16__NEON_MULL_ADDW_DUP, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16__NEON_MULL_ADDW_DUP, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16__NEON_MULL_ADDW_DUP, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16__NEON_MULL_ADDW_DUP, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16__NEON_MULL_ADDW_DUP, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16__NEON_MULL_ADDW_DUP, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16__NEON_MULL_ADDW_DUP, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16__NEON_MULL_ADDW_DUP, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16__NEON_MULL_ADDW_DUP, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16__NEON_MULL_ADDW_DUP, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16__NEON_MULL_ADDW_DUP, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16__NEON_MULL_ADDW_DUP, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16__NEON_MULL_ADDW_DUP, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16__NEON_MULL_ADDW_DUP, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16__NEON_MULL_ADDW_DUP, n_gt_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16__NEON_MULL_ADDW_DUP, n_div_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16__NEON_MULL_ADDW_DUP, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16__NEON_MULL_ADDW_DUP, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .a_offset(127)
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16__NEON_MULL_ADDW_DUP, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 3; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(3)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .a_offset(127)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16__NEON_MULL_ADDW_DUP, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16__NEON_MULL_ADDW_DUP, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_3X16__NEON_MULL_ADDW_DUP, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(8)
+      .cm_stride(19)
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
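+  // Same sweep for the 4x16 variant (mr = 4, nr = 16).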
+  TEST(QS8_IGEMM_MINMAX_4X16__NEON_MULL_ADDW_DUP, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16__NEON_MULL_ADDW_DUP, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .cn_stride(19)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16__NEON_MULL_ADDW_DUP, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16__NEON_MULL_ADDW_DUP, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16__NEON_MULL_ADDW_DUP, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16__NEON_MULL_ADDW_DUP, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16__NEON_MULL_ADDW_DUP, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16__NEON_MULL_ADDW_DUP, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16__NEON_MULL_ADDW_DUP, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16__NEON_MULL_ADDW_DUP, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16__NEON_MULL_ADDW_DUP, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16__NEON_MULL_ADDW_DUP, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16__NEON_MULL_ADDW_DUP, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16__NEON_MULL_ADDW_DUP, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16__NEON_MULL_ADDW_DUP, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16__NEON_MULL_ADDW_DUP, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16__NEON_MULL_ADDW_DUP, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16__NEON_MULL_ADDW_DUP, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16__NEON_MULL_ADDW_DUP, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16__NEON_MULL_ADDW_DUP, n_gt_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16__NEON_MULL_ADDW_DUP, n_div_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16__NEON_MULL_ADDW_DUP, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16__NEON_MULL_ADDW_DUP, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .a_offset(163)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16__NEON_MULL_ADDW_DUP, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 4; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .a_offset(163)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16__NEON_MULL_ADDW_DUP, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16__NEON_MULL_ADDW_DUP, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16__NEON_MULL_ADDW_DUP, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .cm_stride(19)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(QS8_IGEMM_MINMAX_1X8C4__NEONDOT, k_eq_8) {
     TEST_REQUIRES_ARM_NEON_DOT;
     GemmMicrokernelTester()
@@ -3767,474 +29039,6 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_12X8C4__NEONDOT, k_eq_8) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(12)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(12)
-      .n(8)
-      .k(8)
-      .Test(xnn_qs8_igemm_minmax_ukernel_12x8c4__neondot);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_12X8C4__NEONDOT, strided_cn) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(12)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(12)
-      .n(8)
-      .k(8)
-      .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_ukernel_12x8c4__neondot);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_12X8C4__NEONDOT, k_eq_8_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t m = 1; m <= 12; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        GemmMicrokernelTester()
-          .mr(12)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_ukernel_12x8c4__neondot);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_12X8C4__NEONDOT, k_eq_8_subtile_m) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t m = 1; m <= 12; m++) {
-      GemmMicrokernelTester()
-        .mr(12)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_12x8c4__neondot);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_12X8C4__NEONDOT, k_eq_8_subtile_n) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(12)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(12)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_12x8c4__neondot);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_12X8C4__NEONDOT, k_lt_8) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(12)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(12)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_12x8c4__neondot);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_12X8C4__NEONDOT, k_lt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 12; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(12)
-            .nr(8)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_12x8c4__neondot);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_12X8C4__NEONDOT, k_gt_8) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(12)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(12)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_12x8c4__neondot);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_12X8C4__NEONDOT, k_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 12; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(12)
-            .nr(8)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_12x8c4__neondot);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_12X8C4__NEONDOT, k_div_8) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(12)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(12)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_12x8c4__neondot);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_12X8C4__NEONDOT, k_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 12; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(12)
-            .nr(8)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_12x8c4__neondot);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_12X8C4__NEONDOT, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(12)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(12)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_12x8c4__neondot);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_12X8C4__NEONDOT, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(12)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(12)
-          .n(8)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_ukernel_12x8c4__neondot);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_12X8C4__NEONDOT, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 12; m++) {
-          GemmMicrokernelTester()
-            .mr(12)
-            .nr(8)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_12x8c4__neondot);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_12X8C4__NEONDOT, n_div_8) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(12)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(12)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_12x8c4__neondot);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_12X8C4__NEONDOT, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(12)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(12)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_ukernel_12x8c4__neondot);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_12X8C4__NEONDOT, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 12; m++) {
-          GemmMicrokernelTester()
-            .mr(12)
-            .nr(8)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_12x8c4__neondot);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_12X8C4__NEONDOT, small_kernel) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(12)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(12)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_ukernel_12x8c4__neondot);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_12X8C4__NEONDOT, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 12; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(12)
-            .nr(8)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_12x8c4__neondot);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_12X8C4__NEONDOT, n_gt_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(12)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(12)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_12x8c4__neondot);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_12X8C4__NEONDOT, n_div_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(12)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(12)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_12x8c4__neondot);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_12X8C4__NEONDOT, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 12; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(12)
-            .nr(8)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_12x8c4__neondot);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_12X8C4__NEONDOT, a_offset) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(12)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(12)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .a_offset(487)
-        .Test(xnn_qs8_igemm_minmax_ukernel_12x8c4__neondot);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_12X8C4__NEONDOT, zero) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t mz = 0; mz < 12; mz++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(12)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(12)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .a_offset(487)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_ukernel_12x8c4__neondot);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_12X8C4__NEONDOT, qmin) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(12)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(12)
-      .n(8)
-      .k(8)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_12x8c4__neondot);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_12X8C4__NEONDOT, qmax) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(12)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(12)
-      .n(8)
-      .k(8)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_12x8c4__neondot);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_12X8C4__NEONDOT, strided_cm) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(12)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(12)
-      .n(8)
-      .k(8)
-      .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_ukernel_12x8c4__neondot);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(QS8_IGEMM_MINMAX_1X16C4__NEONDOT, k_eq_8) {
     TEST_REQUIRES_ARM_NEON_DOT;
     GemmMicrokernelTester()
@@ -6106,6 +30910,942 @@
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
+#if XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C4__AARCH64_NEONDOT_LD64, strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .cn_stride(19)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C4__AARCH64_NEONDOT_LD64, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C4__AARCH64_NEONDOT_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C4__AARCH64_NEONDOT_LD64, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C4__AARCH64_NEONDOT_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C4__AARCH64_NEONDOT_LD64, k_div_8) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C4__AARCH64_NEONDOT_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C4__AARCH64_NEONDOT_LD64, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C4__AARCH64_NEONDOT_LD64, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C4__AARCH64_NEONDOT_LD64, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C4__AARCH64_NEONDOT_LD64, n_div_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C4__AARCH64_NEONDOT_LD64, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C4__AARCH64_NEONDOT_LD64, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C4__AARCH64_NEONDOT_LD64, small_kernel) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C4__AARCH64_NEONDOT_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C4__AARCH64_NEONDOT_LD64, n_gt_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C4__AARCH64_NEONDOT_LD64, n_div_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C4__AARCH64_NEONDOT_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C4__AARCH64_NEONDOT_LD64, a_offset) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .a_offset(163)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C4__AARCH64_NEONDOT_LD64, zero) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t mz = 0; mz < 4; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .a_offset(163)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C4__AARCH64_NEONDOT_LD64, qmin) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C4__AARCH64_NEONDOT_LD64, qmax) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C4__AARCH64_NEONDOT_LD64, strided_cm) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .cm_stride(19)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64);
+  }
+#endif  // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C4__AARCH64_NEONDOT_CORTEX_A55, strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .cn_stride(19)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_div_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_div_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C4__AARCH64_NEONDOT_CORTEX_A55, small_kernel) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C4__AARCH64_NEONDOT_CORTEX_A55, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_gt_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_div_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C4__AARCH64_NEONDOT_CORTEX_A55, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C4__AARCH64_NEONDOT_CORTEX_A55, a_offset) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .a_offset(331)
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C4__AARCH64_NEONDOT_CORTEX_A55, zero) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t mz = 0; mz < 4; mz++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .a_offset(331)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C4__AARCH64_NEONDOT_CORTEX_A55, qmin) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C4__AARCH64_NEONDOT_CORTEX_A55, qmax) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_4X16C4__AARCH64_NEONDOT_CORTEX_A55, strided_cm) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .cm_stride(19)
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55);
+  }
+#endif  // XNN_ARCH_ARM64
+
+
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(QS8_IGEMM_MINMAX_1X4C2__SSE2_LD64, k_eq_8) {
     TEST_REQUIRES_X86_SSE2;
diff --git a/test/qs8-igemm-minmax.yaml b/test/qs8-igemm-minmax.yaml
index 4761cc3..b658a02 100644
--- a/test/qs8-igemm-minmax.yaml
+++ b/test/qs8-igemm-minmax.yaml
@@ -2,14 +2,122 @@
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
+- name: xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal
+  k-block: 16
+- name: xnn_qs8_igemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal
+  k-block: 16
 - name: xnn_qs8_igemm_minmax_ukernel_1x8__neon_mlal_lane
   k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_2x8__neon_mlal_lane
   k-block: 8
+- name: xnn_qs8_igemm_minmax_ukernel_4x8__neon_mlal_lane
+  k-block: 8
+- name: xnn_qs8_igemm_minmax_ukernel_3x8__neon_mlal_lane
+  k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_1x16__neon_mlal_lane
   k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane
   k-block: 8
+- name: xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane
+  k-block: 8
+- name: xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane
+  k-block: 8
+- name: xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mull_padal
+  k-block: 8
+- name: xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mull_padal
+  k-block: 8
+- name: xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal
+  k-block: 8
+- name: xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal
+  k-block: 8
+- name: xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal
+  k-block: 8
+- name: xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal
+  k-block: 8
+- name: xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal
+  k-block: 8
+- name: xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal
+  k-block: 8
+- name: xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mlal_padal
+  k-block: 16
+- name: xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal
+  k-block: 16
+- name: xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal
+  k-block: 16
+- name: xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal
+  k-block: 16
+- name: xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal
+  k-block: 16
+- name: xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal
+  k-block: 16
+- name: xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal
+  k-block: 16
+- name: xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal
+  k-block: 16
+- name: xnn_qs8_igemm_minmax_ukernel_1x8c16__neon_mlal_padal
+  k-block: 16
+- name: xnn_qs8_igemm_minmax_ukernel_2x8c16__neon_mlal_padal
+  k-block: 16
+- name: xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal
+  k-block: 16
+- name: xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal
+  k-block: 16
+- name: xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal
+  k-block: 16
+- name: xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal
+  k-block: 16
+- name: xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal
+  k-block: 16
+- name: xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal
+  k-block: 16
+- name: xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup
+  k-block: 8
+- name: xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup
+  k-block: 8
+- name: xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup
+  k-block: 8
+- name: xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup
+  k-block: 8
+- name: xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup
+  k-block: 8
+- name: xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup
+  k-block: 8
+- name: xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup
+  k-block: 8
+- name: xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup
+  k-block: 8
+- name: xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup
+  k-block: 16
+- name: xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup
+  k-block: 16
+- name: xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup
+  k-block: 16
+- name: xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup
+  k-block: 16
+- name: xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup
+  k-block: 16
+- name: xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup
+  k-block: 16
+- name: xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup
+  k-block: 16
+- name: xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup
+  k-block: 16
+- name: xnn_qs8_igemm_minmax_ukernel_1x8__neon_mull_addw_dup
+  k-block: 8
+- name: xnn_qs8_igemm_minmax_ukernel_2x8__neon_mull_addw_dup
+  k-block: 8
+- name: xnn_qs8_igemm_minmax_ukernel_3x8__neon_mull_addw_dup
+  k-block: 8
+- name: xnn_qs8_igemm_minmax_ukernel_4x8__neon_mull_addw_dup
+  k-block: 8
+- name: xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup
+  k-block: 8
+- name: xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup
+  k-block: 8
+- name: xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup
+  k-block: 8
+- name: xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup
+  k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_1x8c4__neondot
   k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_4x8c4__neondot
@@ -18,8 +126,6 @@
   k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_8x8c4__neondot
   k-block: 8
-- name: xnn_qs8_igemm_minmax_ukernel_12x8c4__neondot
-  k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot
   k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot
@@ -28,6 +134,10 @@
   k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot
   k-block: 8
+- name: xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64
+  k-block: 8
+- name: xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55
+  k-block: 16
 - name: xnn_qs8_igemm_minmax_ukernel_1x4c2__sse2_ld64
   k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_4x4c2__sse2_ld64
diff --git a/test/qu8-gemm-minmax.cc b/test/qu8-gemm-minmax.cc
index 41ecfde..8efc065 100644
--- a/test/qu8-gemm-minmax.cc
+++ b/test/qu8-gemm-minmax.cc
@@ -2416,948 +2416,4 @@
       .b_zero_point(0)
       .Test(xnn_qu8_gemm_minmax_ukernel_2x2__scalar, GemmMicrokernelTester::Variant::Scalar);
   }
-}
-
-TEST(QU8_GEMM_MINMAX_8X8C4__SCALAR, k_eq_4) {
-  GemmMicrokernelTester()
-    .mr(8)
-    .nr(8)
-    .kr(4)
-    .sr(1)
-    .m(8)
-    .n(8)
-    .k(4)
-    .Test(xnn_qu8_gemm_minmax_ukernel_8x8c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-}
-
-TEST(QU8_GEMM_MINMAX_8X8C4__SCALAR, strided_cn) {
-  GemmMicrokernelTester()
-    .mr(8)
-    .nr(8)
-    .kr(4)
-    .sr(1)
-    .m(8)
-    .n(8)
-    .k(4)
-    .cn_stride(11)
-    .Test(xnn_qu8_gemm_minmax_ukernel_8x8c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-}
-
-TEST(QU8_GEMM_MINMAX_8X8C4__SCALAR, k_eq_4_strided_a) {
-  GemmMicrokernelTester()
-    .mr(8)
-    .nr(8)
-    .kr(4)
-    .sr(1)
-    .m(8)
-    .n(8)
-    .k(4)
-    .a_stride(7)
-    .Test(xnn_qu8_gemm_minmax_ukernel_8x8c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-}
-
-TEST(QU8_GEMM_MINMAX_8X8C4__SCALAR, k_eq_4_subtile) {
-  for (uint32_t m = 1; m <= 8; m++) {
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(8)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(m)
-        .n(n)
-        .k(4)
-        .iterations(1)
-        .Test(xnn_qu8_gemm_minmax_ukernel_8x8c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-    }
-  }
-}
-
-TEST(QU8_GEMM_MINMAX_8X8C4__SCALAR, k_eq_4_subtile_m) {
-  for (uint32_t m = 1; m <= 8; m++) {
-    GemmMicrokernelTester()
-      .mr(8)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(m)
-      .n(8)
-      .k(4)
-      .iterations(1)
-      .Test(xnn_qu8_gemm_minmax_ukernel_8x8c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-  }
-}
-
-TEST(QU8_GEMM_MINMAX_8X8C4__SCALAR, k_eq_4_subtile_n) {
-  for (uint32_t n = 1; n <= 8; n++) {
-    GemmMicrokernelTester()
-      .mr(8)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(8)
-      .n(n)
-      .k(4)
-      .iterations(1)
-      .Test(xnn_qu8_gemm_minmax_ukernel_8x8c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-  }
-}
-
-TEST(QU8_GEMM_MINMAX_8X8C4__SCALAR, k_lt_4) {
-  for (size_t k = 1; k < 4; k++) {
-    GemmMicrokernelTester()
-      .mr(8)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(8)
-      .n(8)
-      .k(k)
-      .Test(xnn_qu8_gemm_minmax_ukernel_8x8c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-  }
-}
-
-TEST(QU8_GEMM_MINMAX_8X8C4__SCALAR, k_lt_4_strided_a) {
-  for (size_t k = 1; k < 4; k++) {
-    GemmMicrokernelTester()
-      .mr(8)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(8)
-      .n(8)
-      .k(k)
-      .a_stride(7)
-      .Test(xnn_qu8_gemm_minmax_ukernel_8x8c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-  }
-}
-
-TEST(QU8_GEMM_MINMAX_8X8C4__SCALAR, k_lt_4_subtile) {
-  for (size_t k = 1; k < 4; k++) {
-    for (uint32_t m = 1; m <= 8; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        GemmMicrokernelTester()
-          .mr(8)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(k)
-          .iterations(1)
-          .Test(xnn_qu8_gemm_minmax_ukernel_8x8c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-}
-
-TEST(QU8_GEMM_MINMAX_8X8C4__SCALAR, k_gt_4) {
-  for (size_t k = 5; k < 8; k++) {
-    GemmMicrokernelTester()
-      .mr(8)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(8)
-      .n(8)
-      .k(k)
-      .Test(xnn_qu8_gemm_minmax_ukernel_8x8c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-  }
-}
-
-TEST(QU8_GEMM_MINMAX_8X8C4__SCALAR, k_gt_4_strided_a) {
-  for (size_t k = 5; k < 8; k++) {
-    GemmMicrokernelTester()
-      .mr(8)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(8)
-      .n(8)
-      .k(k)
-      .a_stride(11)
-      .Test(xnn_qu8_gemm_minmax_ukernel_8x8c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-  }
-}
-
-TEST(QU8_GEMM_MINMAX_8X8C4__SCALAR, k_gt_4_subtile) {
-  for (size_t k = 5; k < 8; k++) {
-    for (uint32_t m = 1; m <= 8; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        GemmMicrokernelTester()
-          .mr(8)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(k)
-          .iterations(1)
-          .Test(xnn_qu8_gemm_minmax_ukernel_8x8c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-}
-
-TEST(QU8_GEMM_MINMAX_8X8C4__SCALAR, k_div_4) {
-  for (size_t k = 8; k <= 40; k += 4) {
-    GemmMicrokernelTester()
-      .mr(8)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(8)
-      .n(8)
-      .k(k)
-      .Test(xnn_qu8_gemm_minmax_ukernel_8x8c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-  }
-}
-
-TEST(QU8_GEMM_MINMAX_8X8C4__SCALAR, k_div_4_strided_a) {
-  for (size_t k = 8; k <= 40; k += 4) {
-    GemmMicrokernelTester()
-      .mr(8)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(8)
-      .n(8)
-      .k(k)
-      .a_stride(43)
-      .Test(xnn_qu8_gemm_minmax_ukernel_8x8c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-  }
-}
-
-TEST(QU8_GEMM_MINMAX_8X8C4__SCALAR, k_div_4_subtile) {
-  for (size_t k = 8; k <= 40; k += 4) {
-    for (uint32_t m = 1; m <= 8; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        GemmMicrokernelTester()
-          .mr(8)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(k)
-          .iterations(1)
-          .Test(xnn_qu8_gemm_minmax_ukernel_8x8c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-}
-
-TEST(QU8_GEMM_MINMAX_8X8C4__SCALAR, n_gt_8) {
-  for (uint32_t n = 9; n < 16; n++) {
-    for (size_t k = 1; k <= 20; k += 5) {
-      GemmMicrokernelTester()
-        .mr(8)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(8)
-        .n(8)
-        .k(k)
-        .Test(xnn_qu8_gemm_minmax_ukernel_8x8c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-    }
-  }
-}
-
-TEST(QU8_GEMM_MINMAX_8X8C4__SCALAR, n_gt_8_strided_cn) {
-  for (uint32_t n = 9; n < 16; n++) {
-    for (size_t k = 1; k <= 20; k += 5) {
-      GemmMicrokernelTester()
-        .mr(8)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(8)
-        .n(8)
-        .k(k)
-        .cn_stride(11)
-        .Test(xnn_qu8_gemm_minmax_ukernel_8x8c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-    }
-  }
-}
-
-TEST(QU8_GEMM_MINMAX_8X8C4__SCALAR, n_gt_8_strided_a) {
-  for (uint32_t n = 9; n < 16; n++) {
-    for (size_t k = 1; k <= 20; k += 5) {
-      GemmMicrokernelTester()
-        .mr(8)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(8)
-        .n(n)
-        .k(k)
-        .a_stride(23)
-        .Test(xnn_qu8_gemm_minmax_ukernel_8x8c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-    }
-  }
-}
-
-TEST(QU8_GEMM_MINMAX_8X8C4__SCALAR, n_gt_8_subtile) {
-  for (uint32_t n = 9; n < 16; n++) {
-    for (size_t k = 1; k <= 20; k += 5) {
-      for (uint32_t m = 1; m <= 8; m++) {
-        GemmMicrokernelTester()
-          .mr(8)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(k)
-          .iterations(1)
-          .Test(xnn_qu8_gemm_minmax_ukernel_8x8c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-}
-
-TEST(QU8_GEMM_MINMAX_8X8C4__SCALAR, n_div_8) {
-  for (uint32_t n = 16; n <= 24; n += 8) {
-    for (size_t k = 1; k <= 20; k += 5) {
-      GemmMicrokernelTester()
-        .mr(8)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(8)
-        .n(8)
-        .k(k)
-        .Test(xnn_qu8_gemm_minmax_ukernel_8x8c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-    }
-  }
-}
-
-TEST(QU8_GEMM_MINMAX_8X8C4__SCALAR, n_div_8_strided_cn) {
-  for (uint32_t n = 16; n <= 24; n += 8) {
-    for (size_t k = 1; k <= 20; k += 5) {
-      GemmMicrokernelTester()
-        .mr(8)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(8)
-        .n(n)
-        .k(k)
-        .cn_stride(11)
-        .Test(xnn_qu8_gemm_minmax_ukernel_8x8c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-    }
-  }
-}
-
-TEST(QU8_GEMM_MINMAX_8X8C4__SCALAR, n_div_8_strided_a) {
-  for (uint32_t n = 16; n <= 24; n += 8) {
-    for (size_t k = 1; k <= 20; k += 5) {
-      GemmMicrokernelTester()
-        .mr(8)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(8)
-        .n(n)
-        .k(k)
-        .a_stride(23)
-        .Test(xnn_qu8_gemm_minmax_ukernel_8x8c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-    }
-  }
-}
-
-TEST(QU8_GEMM_MINMAX_8X8C4__SCALAR, n_div_8_subtile) {
-  for (uint32_t n = 16; n <= 24; n += 8) {
-    for (size_t k = 1; k <= 20; k += 5) {
-      for (uint32_t m = 1; m <= 8; m++) {
-        GemmMicrokernelTester()
-          .mr(8)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(k)
-          .iterations(1)
-          .Test(xnn_qu8_gemm_minmax_ukernel_8x8c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-}
-
-TEST(QU8_GEMM_MINMAX_8X8C4__SCALAR, strided_cm_subtile) {
-  for (size_t k = 1; k <= 20; k += 5) {
-    for (uint32_t m = 1; m <= 8; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        GemmMicrokernelTester()
-          .mr(8)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(k)
-          .cm_stride(11)
-          .iterations(1)
-          .Test(xnn_qu8_gemm_minmax_ukernel_8x8c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-}
-
-TEST(QU8_GEMM_MINMAX_8X8C4__SCALAR, qmin) {
-  GemmMicrokernelTester()
-    .mr(8)
-    .nr(8)
-    .kr(4)
-    .sr(1)
-    .m(8)
-    .n(8)
-    .k(4)
-    .qmin(128)
-    .Test(xnn_qu8_gemm_minmax_ukernel_8x8c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-}
-
-TEST(QU8_GEMM_MINMAX_8X8C4__SCALAR, qmax) {
-  GemmMicrokernelTester()
-    .mr(8)
-    .nr(8)
-    .kr(4)
-    .sr(1)
-    .m(8)
-    .n(8)
-    .k(4)
-    .qmax(128)
-    .Test(xnn_qu8_gemm_minmax_ukernel_8x8c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-}
-
-TEST(QU8_GEMM_MINMAX_8X8C4__SCALAR, strided_cm) {
-  GemmMicrokernelTester()
-    .mr(8)
-    .nr(8)
-    .kr(4)
-    .sr(1)
-    .m(8)
-    .n(8)
-    .k(4)
-    .cm_stride(11)
-    .Test(xnn_qu8_gemm_minmax_ukernel_8x8c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-}
-
-TEST(QU8_GEMM_MINMAX_8X8C4__SCALAR, no_a_zero_point) {
-  for (size_t k = 1; k <= 20; k += 5) {
-    GemmMicrokernelTester()
-      .mr(8)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(8)
-      .n(8)
-      .k(k)
-      .a_zero_point(0)
-      .Test(xnn_qu8_gemm_minmax_ukernel_8x8c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-  }
-}
-
-TEST(QU8_GEMM_MINMAX_8X8C4__SCALAR, no_b_zero_point) {
-  for (size_t k = 1; k <= 20; k += 5) {
-    GemmMicrokernelTester()
-      .mr(8)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(8)
-      .n(8)
-      .k(k)
-      .b_zero_point(0)
-      .Test(xnn_qu8_gemm_minmax_ukernel_8x8c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-  }
-}
-
-TEST(QU8_GEMM_MINMAX_8X8C4__SCALAR, no_zero_point) {
-  for (size_t k = 1; k <= 20; k += 5) {
-    GemmMicrokernelTester()
-      .mr(8)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(8)
-      .n(8)
-      .k(k)
-      .a_zero_point(0)
-      .b_zero_point(0)
-      .Test(xnn_qu8_gemm_minmax_ukernel_8x8c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-  }
-}
-
-TEST(QU8_GEMM_MINMAX_12X4C4__SCALAR, k_eq_4) {
-  GemmMicrokernelTester()
-    .mr(12)
-    .nr(4)
-    .kr(4)
-    .sr(1)
-    .m(12)
-    .n(4)
-    .k(4)
-    .Test(xnn_qu8_gemm_minmax_ukernel_12x4c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-}
-
-TEST(QU8_GEMM_MINMAX_12X4C4__SCALAR, strided_cn) {
-  GemmMicrokernelTester()
-    .mr(12)
-    .nr(4)
-    .kr(4)
-    .sr(1)
-    .m(12)
-    .n(4)
-    .k(4)
-    .cn_stride(7)
-    .Test(xnn_qu8_gemm_minmax_ukernel_12x4c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-}
-
-TEST(QU8_GEMM_MINMAX_12X4C4__SCALAR, k_eq_4_strided_a) {
-  GemmMicrokernelTester()
-    .mr(12)
-    .nr(4)
-    .kr(4)
-    .sr(1)
-    .m(12)
-    .n(4)
-    .k(4)
-    .a_stride(7)
-    .Test(xnn_qu8_gemm_minmax_ukernel_12x4c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-}
-
-TEST(QU8_GEMM_MINMAX_12X4C4__SCALAR, k_eq_4_subtile) {
-  for (uint32_t m = 1; m <= 12; m++) {
-    for (uint32_t n = 1; n <= 4; n++) {
-      GemmMicrokernelTester()
-        .mr(12)
-        .nr(4)
-        .kr(4)
-        .sr(1)
-        .m(m)
-        .n(n)
-        .k(4)
-        .iterations(1)
-        .Test(xnn_qu8_gemm_minmax_ukernel_12x4c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-    }
-  }
-}
-
-TEST(QU8_GEMM_MINMAX_12X4C4__SCALAR, k_eq_4_subtile_m) {
-  for (uint32_t m = 1; m <= 12; m++) {
-    GemmMicrokernelTester()
-      .mr(12)
-      .nr(4)
-      .kr(4)
-      .sr(1)
-      .m(m)
-      .n(4)
-      .k(4)
-      .iterations(1)
-      .Test(xnn_qu8_gemm_minmax_ukernel_12x4c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-  }
-}
-
-TEST(QU8_GEMM_MINMAX_12X4C4__SCALAR, k_eq_4_subtile_n) {
-  for (uint32_t n = 1; n <= 4; n++) {
-    GemmMicrokernelTester()
-      .mr(12)
-      .nr(4)
-      .kr(4)
-      .sr(1)
-      .m(12)
-      .n(n)
-      .k(4)
-      .iterations(1)
-      .Test(xnn_qu8_gemm_minmax_ukernel_12x4c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-  }
-}
-
-TEST(QU8_GEMM_MINMAX_12X4C4__SCALAR, k_lt_4) {
-  for (size_t k = 1; k < 4; k++) {
-    GemmMicrokernelTester()
-      .mr(12)
-      .nr(4)
-      .kr(4)
-      .sr(1)
-      .m(12)
-      .n(4)
-      .k(k)
-      .Test(xnn_qu8_gemm_minmax_ukernel_12x4c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-  }
-}
-
-TEST(QU8_GEMM_MINMAX_12X4C4__SCALAR, k_lt_4_strided_a) {
-  for (size_t k = 1; k < 4; k++) {
-    GemmMicrokernelTester()
-      .mr(12)
-      .nr(4)
-      .kr(4)
-      .sr(1)
-      .m(12)
-      .n(4)
-      .k(k)
-      .a_stride(7)
-      .Test(xnn_qu8_gemm_minmax_ukernel_12x4c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-  }
-}
-
-TEST(QU8_GEMM_MINMAX_12X4C4__SCALAR, k_lt_4_subtile) {
-  for (size_t k = 1; k < 4; k++) {
-    for (uint32_t m = 1; m <= 12; m++) {
-      for (uint32_t n = 1; n <= 4; n++) {
-        GemmMicrokernelTester()
-          .mr(12)
-          .nr(4)
-          .kr(4)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(k)
-          .iterations(1)
-          .Test(xnn_qu8_gemm_minmax_ukernel_12x4c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-}
-
-TEST(QU8_GEMM_MINMAX_12X4C4__SCALAR, k_gt_4) {
-  for (size_t k = 5; k < 8; k++) {
-    GemmMicrokernelTester()
-      .mr(12)
-      .nr(4)
-      .kr(4)
-      .sr(1)
-      .m(12)
-      .n(4)
-      .k(k)
-      .Test(xnn_qu8_gemm_minmax_ukernel_12x4c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-  }
-}
-
-TEST(QU8_GEMM_MINMAX_12X4C4__SCALAR, k_gt_4_strided_a) {
-  for (size_t k = 5; k < 8; k++) {
-    GemmMicrokernelTester()
-      .mr(12)
-      .nr(4)
-      .kr(4)
-      .sr(1)
-      .m(12)
-      .n(4)
-      .k(k)
-      .a_stride(11)
-      .Test(xnn_qu8_gemm_minmax_ukernel_12x4c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-  }
-}
-
-TEST(QU8_GEMM_MINMAX_12X4C4__SCALAR, k_gt_4_subtile) {
-  for (size_t k = 5; k < 8; k++) {
-    for (uint32_t m = 1; m <= 12; m++) {
-      for (uint32_t n = 1; n <= 4; n++) {
-        GemmMicrokernelTester()
-          .mr(12)
-          .nr(4)
-          .kr(4)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(k)
-          .iterations(1)
-          .Test(xnn_qu8_gemm_minmax_ukernel_12x4c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-}
-
-TEST(QU8_GEMM_MINMAX_12X4C4__SCALAR, k_div_4) {
-  for (size_t k = 8; k <= 40; k += 4) {
-    GemmMicrokernelTester()
-      .mr(12)
-      .nr(4)
-      .kr(4)
-      .sr(1)
-      .m(12)
-      .n(4)
-      .k(k)
-      .Test(xnn_qu8_gemm_minmax_ukernel_12x4c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-  }
-}
-
-TEST(QU8_GEMM_MINMAX_12X4C4__SCALAR, k_div_4_strided_a) {
-  for (size_t k = 8; k <= 40; k += 4) {
-    GemmMicrokernelTester()
-      .mr(12)
-      .nr(4)
-      .kr(4)
-      .sr(1)
-      .m(12)
-      .n(4)
-      .k(k)
-      .a_stride(43)
-      .Test(xnn_qu8_gemm_minmax_ukernel_12x4c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-  }
-}
-
-TEST(QU8_GEMM_MINMAX_12X4C4__SCALAR, k_div_4_subtile) {
-  for (size_t k = 8; k <= 40; k += 4) {
-    for (uint32_t m = 1; m <= 12; m++) {
-      for (uint32_t n = 1; n <= 4; n++) {
-        GemmMicrokernelTester()
-          .mr(12)
-          .nr(4)
-          .kr(4)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(k)
-          .iterations(1)
-          .Test(xnn_qu8_gemm_minmax_ukernel_12x4c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-}
-
-TEST(QU8_GEMM_MINMAX_12X4C4__SCALAR, n_gt_4) {
-  for (uint32_t n = 5; n < 8; n++) {
-    for (size_t k = 1; k <= 20; k += 5) {
-      GemmMicrokernelTester()
-        .mr(12)
-        .nr(4)
-        .kr(4)
-        .sr(1)
-        .m(12)
-        .n(4)
-        .k(k)
-        .Test(xnn_qu8_gemm_minmax_ukernel_12x4c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-    }
-  }
-}
-
-TEST(QU8_GEMM_MINMAX_12X4C4__SCALAR, n_gt_4_strided_cn) {
-  for (uint32_t n = 5; n < 8; n++) {
-    for (size_t k = 1; k <= 20; k += 5) {
-      GemmMicrokernelTester()
-        .mr(12)
-        .nr(4)
-        .kr(4)
-        .sr(1)
-        .m(12)
-        .n(4)
-        .k(k)
-        .cn_stride(7)
-        .Test(xnn_qu8_gemm_minmax_ukernel_12x4c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-    }
-  }
-}
-
-TEST(QU8_GEMM_MINMAX_12X4C4__SCALAR, n_gt_4_strided_a) {
-  for (uint32_t n = 5; n < 8; n++) {
-    for (size_t k = 1; k <= 20; k += 5) {
-      GemmMicrokernelTester()
-        .mr(12)
-        .nr(4)
-        .kr(4)
-        .sr(1)
-        .m(12)
-        .n(n)
-        .k(k)
-        .a_stride(23)
-        .Test(xnn_qu8_gemm_minmax_ukernel_12x4c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-    }
-  }
-}
-
-TEST(QU8_GEMM_MINMAX_12X4C4__SCALAR, n_gt_4_subtile) {
-  for (uint32_t n = 5; n < 8; n++) {
-    for (size_t k = 1; k <= 20; k += 5) {
-      for (uint32_t m = 1; m <= 12; m++) {
-        GemmMicrokernelTester()
-          .mr(12)
-          .nr(4)
-          .kr(4)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(k)
-          .iterations(1)
-          .Test(xnn_qu8_gemm_minmax_ukernel_12x4c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-}
-
-TEST(QU8_GEMM_MINMAX_12X4C4__SCALAR, n_div_4) {
-  for (uint32_t n = 8; n <= 12; n += 4) {
-    for (size_t k = 1; k <= 20; k += 5) {
-      GemmMicrokernelTester()
-        .mr(12)
-        .nr(4)
-        .kr(4)
-        .sr(1)
-        .m(12)
-        .n(4)
-        .k(k)
-        .Test(xnn_qu8_gemm_minmax_ukernel_12x4c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-    }
-  }
-}
-
-TEST(QU8_GEMM_MINMAX_12X4C4__SCALAR, n_div_4_strided_cn) {
-  for (uint32_t n = 8; n <= 12; n += 4) {
-    for (size_t k = 1; k <= 20; k += 5) {
-      GemmMicrokernelTester()
-        .mr(12)
-        .nr(4)
-        .kr(4)
-        .sr(1)
-        .m(12)
-        .n(n)
-        .k(k)
-        .cn_stride(7)
-        .Test(xnn_qu8_gemm_minmax_ukernel_12x4c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-    }
-  }
-}
-
-TEST(QU8_GEMM_MINMAX_12X4C4__SCALAR, n_div_4_strided_a) {
-  for (uint32_t n = 8; n <= 12; n += 4) {
-    for (size_t k = 1; k <= 20; k += 5) {
-      GemmMicrokernelTester()
-        .mr(12)
-        .nr(4)
-        .kr(4)
-        .sr(1)
-        .m(12)
-        .n(n)
-        .k(k)
-        .a_stride(23)
-        .Test(xnn_qu8_gemm_minmax_ukernel_12x4c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-    }
-  }
-}
-
-TEST(QU8_GEMM_MINMAX_12X4C4__SCALAR, n_div_4_subtile) {
-  for (uint32_t n = 8; n <= 12; n += 4) {
-    for (size_t k = 1; k <= 20; k += 5) {
-      for (uint32_t m = 1; m <= 12; m++) {
-        GemmMicrokernelTester()
-          .mr(12)
-          .nr(4)
-          .kr(4)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(k)
-          .iterations(1)
-          .Test(xnn_qu8_gemm_minmax_ukernel_12x4c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-}
-
-TEST(QU8_GEMM_MINMAX_12X4C4__SCALAR, strided_cm_subtile) {
-  for (size_t k = 1; k <= 20; k += 5) {
-    for (uint32_t m = 1; m <= 12; m++) {
-      for (uint32_t n = 1; n <= 4; n++) {
-        GemmMicrokernelTester()
-          .mr(12)
-          .nr(4)
-          .kr(4)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(k)
-          .cm_stride(7)
-          .iterations(1)
-          .Test(xnn_qu8_gemm_minmax_ukernel_12x4c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-}
-
-TEST(QU8_GEMM_MINMAX_12X4C4__SCALAR, qmin) {
-  GemmMicrokernelTester()
-    .mr(12)
-    .nr(4)
-    .kr(4)
-    .sr(1)
-    .m(12)
-    .n(4)
-    .k(4)
-    .qmin(128)
-    .Test(xnn_qu8_gemm_minmax_ukernel_12x4c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-}
-
-TEST(QU8_GEMM_MINMAX_12X4C4__SCALAR, qmax) {
-  GemmMicrokernelTester()
-    .mr(12)
-    .nr(4)
-    .kr(4)
-    .sr(1)
-    .m(12)
-    .n(4)
-    .k(4)
-    .qmax(128)
-    .Test(xnn_qu8_gemm_minmax_ukernel_12x4c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-}
-
-TEST(QU8_GEMM_MINMAX_12X4C4__SCALAR, strided_cm) {
-  GemmMicrokernelTester()
-    .mr(12)
-    .nr(4)
-    .kr(4)
-    .sr(1)
-    .m(12)
-    .n(4)
-    .k(4)
-    .cm_stride(7)
-    .Test(xnn_qu8_gemm_minmax_ukernel_12x4c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-}
-
-TEST(QU8_GEMM_MINMAX_12X4C4__SCALAR, no_a_zero_point) {
-  for (size_t k = 1; k <= 20; k += 5) {
-    GemmMicrokernelTester()
-      .mr(12)
-      .nr(4)
-      .kr(4)
-      .sr(1)
-      .m(12)
-      .n(4)
-      .k(k)
-      .a_zero_point(0)
-      .Test(xnn_qu8_gemm_minmax_ukernel_12x4c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-  }
-}
-
-TEST(QU8_GEMM_MINMAX_12X4C4__SCALAR, no_b_zero_point) {
-  for (size_t k = 1; k <= 20; k += 5) {
-    GemmMicrokernelTester()
-      .mr(12)
-      .nr(4)
-      .kr(4)
-      .sr(1)
-      .m(12)
-      .n(4)
-      .k(k)
-      .b_zero_point(0)
-      .Test(xnn_qu8_gemm_minmax_ukernel_12x4c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-  }
-}
-
-TEST(QU8_GEMM_MINMAX_12X4C4__SCALAR, no_zero_point) {
-  for (size_t k = 1; k <= 20; k += 5) {
-    GemmMicrokernelTester()
-      .mr(12)
-      .nr(4)
-      .kr(4)
-      .sr(1)
-      .m(12)
-      .n(4)
-      .k(k)
-      .a_zero_point(0)
-      .b_zero_point(0)
-      .Test(xnn_qu8_gemm_minmax_ukernel_12x4c4__scalar, GemmMicrokernelTester::Variant::Scalar);
-  }
 }
\ No newline at end of file
diff --git a/test/qu8-gemm-minmax.yaml b/test/qu8-gemm-minmax.yaml
index 3d2aa84..d6fb30f 100644
--- a/test/qu8-gemm-minmax.yaml
+++ b/test/qu8-gemm-minmax.yaml
@@ -12,8 +12,3 @@
   k-block: 8
 - name: xnn_qu8_gemm_minmax_ukernel_2x2__scalar
   k-block: 1
-- name: xnn_qu8_gemm_minmax_ukernel_8x8c4__scalar
-  k-block: 4
-- name: xnn_qu8_gemm_minmax_ukernel_12x4c4__scalar
-  k-block: 4
-
diff --git a/test/vbinaryc-microkernel-tester.h b/test/vbinaryc-microkernel-tester.h
index 6bb1b48..b226b36 100644
--- a/test/vbinaryc-microkernel-tester.h
+++ b/test/vbinaryc-microkernel-tester.h
@@ -90,7 +90,7 @@
   void Test(xnn_f16_vbinary_ukernel_function vbinaryc, OpType op_type) const {
     std::random_device random_device;
     auto rng = std::mt19937(random_device());
-    auto f32rng = std::bind(std::uniform_real_distribution<float>(1.0e-3f, 1.0f), rng);
+    auto f32rng = std::bind(std::uniform_real_distribution<float>(1.0e-2f, 1.0f), rng);
     auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
 
     std::vector<uint16_t> a(batch_size() + XNN_EXTRA_BYTES / sizeof(uint16_t));
diff --git a/third_party/cpuinfo.BUILD b/third_party/cpuinfo.BUILD
index f5fd5e1..128d683 100644
--- a/third_party/cpuinfo.BUILD
+++ b/third_party/cpuinfo.BUILD
@@ -103,6 +103,7 @@
         ":linux_armv7a": COMMON_SRCS + ARM_SRCS + LINUX_SRCS + LINUX_ARM32_SRCS,
         ":linux_aarch64": COMMON_SRCS + ARM_SRCS + LINUX_SRCS + LINUX_ARM64_SRCS,
         ":macos_x86_64": COMMON_SRCS + X86_SRCS + MACH_SRCS + MACH_X86_SRCS,
+        ":macos_arm64": COMMON_SRCS + MACH_SRCS + MACH_ARM_SRCS,
         ":windows_x86_64": COMMON_SRCS + X86_SRCS + WINDOWS_X86_SRCS,
         ":android_armv7": COMMON_SRCS + ARM_SRCS + LINUX_SRCS + LINUX_ARM32_SRCS + ANDROID_ARM_SRCS,
         ":android_arm64": COMMON_SRCS + ARM_SRCS + LINUX_SRCS + LINUX_ARM64_SRCS + ANDROID_ARM_SRCS,
@@ -204,6 +205,14 @@
 )
 
 config_setting(
+    name = "macos_arm64",
+    values = {
+        "apple_platform_type": "macos",
+        "cpu": "darwin_arm64",
+    },
+)
+
+config_setting(
     name = "windows_x86_64",
     values = {"cpu": "x64_windows"},
 )